@@ -24,6 +24,18 @@ Description:
respectively) whether or not this OCC has limited the processor
frequency due to power usage.
+What: /sys/bus/platform/drivers/occ-hwmon/<dev>/occ_error
+Date: June 2017
+KernelVersion: 4.14
+Contact: eajames@us.ibm.com
+Description:
+ A read-only attribute that indicates any error condition
+ observed by the OCC or detected by the driver. Reading the
+ attribute will return an integer. A positive integer indicates
+ an error response from the OCC. A negative integer indicates a
+ possible bus error or other error condition detected by the
+ driver. A "0" indicates no error, or an error that the driver
+ has not yet observed often enough to report.
+
What: /sys/bus/platform/drivers/occ-hwmon/<dev>/occ_master
Date: June 2017
KernelVersion: 4.14
@@ -11,6 +11,9 @@
#include "common.h"
#include <linux/hwmon.h>
+/* Number of OCCs currently set up by the driver. occ_poll() compares it with
+ * the number of OCCs that the master OCC reports as present.
+ */
+static atomic_t occ_num_occs = ATOMIC_INIT(0);
+
/* OCC sensor type and version definitions */
struct temp_sensor_1 {
@@ -112,6 +115,9 @@ struct extended_sensor {
static int occ_poll(struct occ *occ)
{
+ int rc;
+ struct occ_poll_response_header *header =
+ (struct occ_poll_response_header *)occ->resp.data;
u16 checksum = occ->poll_cmd_data + 1;
u8 cmd[8];
@@ -126,7 +132,32 @@ static int occ_poll(struct occ *occ)
cmd[7] = 0;
/* mutex should already be locked if necessary */
- return occ->send_cmd(occ, cmd);
+ rc = occ->send_cmd(occ, cmd);
+ if (rc < 0)
+ return rc;
+
+ /* check for "safe" state */
+ if (header->occ_state == OCC_STATE_SAFE) {
+ if (occ->last_safe) {
+ if (time_after(jiffies,
+ occ->last_safe + OCC_SAFE_TIMEOUT))
+ occ->error = -EHOSTDOWN;
+ } else
+ occ->last_safe = jiffies;
+ } else
+ occ->last_safe = 0;
+
+ /* verify number of present OCCs */
+ if (header->status & OCC_STAT_MASTER) {
+ if (hweight8(header->occs_present) !=
+ atomic_read(&occ_num_occs)) {
+ occ->error = -EXDEV;
+ occ->bad_present_count++;
+ } else
+ occ->bad_present_count = 0;
+ }
+
+ return rc;
}
static int occ_set_user_power_cap(struct occ *occ, u16 user_power_cap)
@@ -993,6 +1024,19 @@ static int occ_setup_sensor_attrs(struct occ *occ)
return 0;
}
+static ssize_t occ_show_error(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int error = 0;
+ struct occ *occ = dev_get_drvdata(dev);
+ /* Show handler for the occ_error attribute. Prints occ->error only
+  * when error_count or bad_present_count exceeds
+  * OCC_ERROR_COUNT_THRESHOLD, or when last_safe is nonzero (occ_poll()
+  * sets it on seeing the safe state and clears it otherwise). In all
+  * other cases prints 0. Returns the snprintf() result.
+  */
+ if (occ->error_count > OCC_ERROR_COUNT_THRESHOLD || occ->last_safe ||
+ occ->bad_present_count > OCC_ERROR_COUNT_THRESHOLD)
+ error = occ->error;
+
+ return snprintf(buf, PAGE_SIZE - 1, "%d\n", error);
+}
+
static ssize_t occ_show_status(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -1078,6 +1122,10 @@ static int occ_create_status_attrs(struct occ *occ)
(struct sensor_device_attribute)SENSOR_ATTR(occ_status, 0444,
occ_show_status,
NULL, 6);
+ occ->status_attrs[7] =
+ (struct sensor_device_attribute)SENSOR_ATTR(occ_error, 0444,
+ occ_show_error,
+ NULL, 0);
for (i = 0; i < OCC_NUM_STATUS_ATTRS; ++i) {
rc = device_create_file(dev, &occ->status_attrs[i].dev_attr);
@@ -1140,6 +1188,7 @@ int occ_setup(struct occ *occ, const char *name)
{
int rc;
+ atomic_inc(&occ_num_occs);
mutex_init(&occ->lock);
occ->groups[0] = &occ->group;
@@ -1187,5 +1236,7 @@ int occ_shutdown(struct occ *occ)
device_remove_file(occ->bus_dev,
&occ->status_attrs[i].dev_attr);
+ atomic_dec(&occ_num_occs);
+
return 0;
}
@@ -13,10 +13,13 @@
#include <linux/hwmon-sysfs.h>
#include <linux/sysfs.h>
-#define OCC_NUM_STATUS_ATTRS 7
+#define OCC_ERROR_COUNT_THRESHOLD 2
+
+#define OCC_NUM_STATUS_ATTRS 8
#define OCC_RESP_DATA_BYTES 4089
+#define OCC_SAFE_TIMEOUT msecs_to_jiffies(60000) /* 1 min */
#define OCC_UPDATE_FREQUENCY msecs_to_jiffies(1000)
#define OCC_TIMEOUT_MS 5000
#define OCC_CMD_IN_PRG_MS 100
@@ -39,6 +42,9 @@
#define OCC_EXT_STAT_MEM_THROTTLE 0x20
#define OCC_EXT_STAT_QUICK_DROP 0x10
+/* OCC state enumeration */
+#define OCC_STATE_SAFE 4
+
/* Same response format for all OCC versions.
* Allocate the largest possible response.
*/
@@ -132,6 +138,11 @@ struct occ {
/* non-hwmon attributes for more OCC properties */
struct sensor_device_attribute *status_attrs;
+
+ int error;
+ unsigned int error_count; /* num errors observed */
+ unsigned int bad_present_count; /* num polls w/bad num occs */
+ unsigned long last_safe; /* time entered safe state */
};
int occ_setup(struct occ *occ, const char *name);
@@ -161,7 +161,10 @@ static int p8_i2c_occ_send_cmd(struct occ *occ, u8 *cmd)
rc = -EFAULT;
}
+ occ->error = resp->return_status;
+
if (rc < 0) {
+ occ->error_count++;
dev_warn(&client->dev, "occ bad response: %d\n",
resp->return_status);
return rc;
@@ -169,9 +172,11 @@ static int p8_i2c_occ_send_cmd(struct occ *occ, u8 *cmd)
data_length = get_unaligned_be16(&resp->data_length_be);
if (data_length > OCC_RESP_DATA_BYTES) {
+ occ->error_count++;
+ occ->error = -EDOM;
dev_warn(&client->dev, "occ bad data length: %d\n",
data_length);
- return -EDOM;
+ return occ->error;
}
/* read remaining response */
@@ -181,9 +186,12 @@ static int p8_i2c_occ_send_cmd(struct occ *occ, u8 *cmd)
goto err;
}
+ occ->error_count = 0;
return data_length + 7;
err:
+ occ->error_count++;
+ occ->error = rc;
dev_err(&client->dev, "i2c scom op failed rc: %d\n", rc);
return rc;
}
@@ -9,6 +9,7 @@
#include "common.h"
#include <linux/init.h>
+#include <linux/hwmon.h>
#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/occ.h>
@@ -33,7 +34,7 @@ static int p9_sbe_occ_send_cmd(struct occ *occ, u8 *cmd)
retry:
client = occ_drv_open(p9_sbe_occ->sbe, 0);
if (!client)
- return -ENODEV;
+ return -ENODEV; /* don't increment error counter */
/* skip first byte (sequence number), OCC driver handles it */
rc = occ_drv_write(client, (const char *)&cmd[1], 7);
@@ -75,15 +76,21 @@ static int p9_sbe_occ_send_cmd(struct occ *occ, u8 *cmd)
rc = -EFAULT;
}
+ occ->error = resp->return_status;
+
if (rc < 0) {
+ occ->error_count++;
dev_warn(occ->bus_dev, "occ bad response: %d\n",
resp->return_status);
return rc;
}
+ occ->error_count = 0;
return 0;
err:
+ occ->error_count++;
+ occ->error = rc;
occ_drv_release(client);
dev_err(occ->bus_dev, "occ bus op failed rc: %d\n", rc);
return rc;