Message ID | 20180530001954.12000-2-saeedm@mellanox.com (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
On Tue, May 29, 2018 at 05:19:53PM -0700, Saeed Mahameed wrote: > From: Ilan Tayari <ilant@mellanox.com> > > Temperature warning event is sent by FW to indicate high temperature > as detected by one of the sensors on the board. > Add handling of this event by writing the numbers of the alert sensors > to the kernel log. Hi Saeed Is the temperature itself available? If so, it would be better to expose this as a hwmon device per temperature sensor. Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2018-05-30 at 03:04 +0200, Andrew Lunn wrote: > On Tue, May 29, 2018 at 05:19:53PM -0700, Saeed Mahameed wrote: > > From: Ilan Tayari <ilant@mellanox.com> > > > > Temperature warning event is sent by FW to indicate high > > temperature > > as detected by one of the sensors on the board. > > Add handling of this event by writing the numbers of the alert > > sensors > > to the kernel log. > > Hi Saeed > > Is the temperature itself available? If so, it would be better to > expose this as a hwmon device per temperature sensor. > Hi Andrew, yes the temperature is available by other means, this patch is needed for alert information reasons in order to know which internal sensors triggered the alarm. We are working in parallel to expose temperature sensor to hwmon, but this is still WIP. Is it ok to have both ? > Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> Hi Andrew, yes the temperature is available by other means, this patch > is needed for alert information reasons in order to know which internal > sensors triggered the alarm. > We are working in parallel to expose temperature sensor to hwmon, but > this is still WIP. > > > Is it ok to have both ? Hi Saeed Ideally no. hwmon has mechanisms for setting alarm thresholds, and indicating the thresholds have been exceeded. There are also ways to tie this to thermal zones, so the system can react on overheating, slow down the CPU, drop voltages, ramp up fans, etc. hwmon should be your primary interface, not dmesg. But if you are stuck doing things in the wrong order, i guess it is O.K. I don't think dmesg is a Binary API, so you can remove it later. Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2018-05-30 at 18:17 +0200, Andrew Lunn wrote: > > Hi Andrew, yes the temperature is available by other means, this > > patch > > is needed for alert information reasons in order to know which > > internal > > sensors triggered the alarm. > > We are working in parallel to expose temperature sensor to hwmon, > > but > > this is still WIP. > > > > > > Is it ok to have both ? > > Hi Saeed > > Ideally no. hwmon has mechanisms for setting alarm thresholds, and > indicating the thresholds have been exceeded. There are also ways to > tie this to thermal zones, so the system can react on overheating, > slow down the CPU, drop voltages, ramp up fans, etc. hwmon should be > your primary interface, not dmesg. > Yes we are working on this, but it is not something that can happen soon since we need to define the correct Firmware APIs which are still WIP. > But if you are stuck doing things in the wrong order, i guess it is > O.K. I don't think dmesg is a Binary API, so you can remove it later. > Yes this is the plan, once the hwmon is supported we will remove the extra dmesg warnings. > Andrew -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index c1c94974e16b..4bd4f011f0a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -141,6 +141,8 @@ static const char *eqe_type_str(u8 type) return "MLX5_EVENT_TYPE_GPIO_EVENT"; case MLX5_EVENT_TYPE_PORT_MODULE_EVENT: return "MLX5_EVENT_TYPE_PORT_MODULE_EVENT"; + case MLX5_EVENT_TYPE_TEMP_WARN_EVENT: + return "MLX5_EVENT_TYPE_TEMP_WARN_EVENT"; case MLX5_EVENT_TYPE_REMOTE_CONFIG: return "MLX5_EVENT_TYPE_REMOTE_CONFIG"; case MLX5_EVENT_TYPE_DB_BF_CONGESTION: @@ -393,6 +395,20 @@ static void general_event_handler(struct mlx5_core_dev *dev, } } +static void mlx5_temp_warning_event(struct mlx5_core_dev *dev, + struct mlx5_eqe *eqe) +{ + u64 value_lsb; + u64 value_msb; + + value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb); + value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb); + + mlx5_core_warn(dev, + "High temperature on sensors with bit set %llx %llx", + value_msb, value_lsb); +} + /* caller must eventually call mlx5_cq_put on the returned cq */ static struct mlx5_core_cq *mlx5_eq_cq_get(struct mlx5_eq *eq, u32 cqn) { @@ -547,6 +563,10 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) mlx5_fpga_event(dev, eqe->type, &eqe->data.raw); break; + case MLX5_EVENT_TYPE_TEMP_WARN_EVENT: + mlx5_temp_warning_event(dev, eqe); + break; + case MLX5_EVENT_TYPE_GENERAL_EVENT: general_event_handler(dev, eqe); break; @@ -824,6 +844,9 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED); + if (MLX5_CAP_GEN(dev, temp_warn_event)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_TEMP_WARN_EVENT); + err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, "mlx5_cmd_eq", MLX5_EQ_TYPE_ASYNC); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 
2bc27f8c5b87..eddacee5cf61 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -314,6 +314,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_PORT_CHANGE = 0x09, MLX5_EVENT_TYPE_GPIO_EVENT = 0x15, MLX5_EVENT_TYPE_PORT_MODULE_EVENT = 0x16, + MLX5_EVENT_TYPE_TEMP_WARN_EVENT = 0x17, MLX5_EVENT_TYPE_REMOTE_CONFIG = 0x19, MLX5_EVENT_TYPE_GENERAL_EVENT = 0x22, MLX5_EVENT_TYPE_PPS_EVENT = 0x25, @@ -626,6 +627,11 @@ struct mlx5_eqe_dct { __be32 dctn; }; +struct mlx5_eqe_temp_warning { + __be64 sensor_warning_msb; + __be64 sensor_warning_lsb; +} __packed; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -642,6 +648,7 @@ union ev_data { struct mlx5_eqe_port_module port_module; struct mlx5_eqe_pps pps; struct mlx5_eqe_dct dct; + struct mlx5_eqe_temp_warning temp_warning; } __packed; struct mlx5_eqe { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 10c1613d9434..ba30c26aa6eb 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -926,7 +926,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_msg[0x5]; u8 reserved_at_1c8[0x4]; u8 max_tc[0x4]; - u8 reserved_at_1d0[0x1]; + u8 temp_warn_event[0x1]; u8 dcbx[0x1]; u8 general_notification_event[0x1]; u8 reserved_at_1d3[0x2];