Message ID: 20241031153925.36216-3-amit@kernel.org
State:      New
Series:     Add support for the ERAPS feature
On Thu, Oct 31, 2024 at 04:39:25PM +0100, Amit Shah wrote:
> @@ -3559,6 +3582,27 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
> 
>  	trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
> 
> +	if (boot_cpu_has(X86_FEATURE_ERAPS)
> +	    && vmcb_is_larger_rap(svm->vmcb01.ptr)) {
	    ^^

This should be at the end of the previous line.
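That is, kernel coding style keeps the boolean operator at the end of the first line of a wrapped condition; a sketch of the corrected form:

	if (boot_cpu_has(X86_FEATURE_ERAPS) &&
	    vmcb_is_larger_rap(svm->vmcb01.ptr)) {
		vmcb_set_flush_rap(svm->vmcb01.ptr);
	}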
Hi Amit,

kernel test robot noticed the following build warnings:

[auto build test WARNING on kvm/queue]
[also build test WARNING on mst-vhost/linux-next tip/master tip/x86/core linus/master v6.12-rc5 next-20241031]
[cannot apply to kvm/linux-next tip/auto-latest]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Amit-Shah/x86-cpu-bugs-add-support-for-AMD-ERAPS-feature/20241031-234332
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:    https://lore.kernel.org/r/20241031153925.36216-3-amit%40kernel.org
patch subject: [PATCH 2/2] x86: kvm: svm: add support for ERAPS and FLUSH_RAP_ON_VMRUN
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20241101/202411011119.l3yRJpht-lkp@intel.com/config)
compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241101/202411011119.l3yRJpht-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411011119.l3yRJpht-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from arch/x86/kvm/cpuid.c:13:
   In file included from include/linux/kvm_host.h:16:
   In file included from include/linux/mm.h:2213:
   include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     504 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     505 |                            item];
         |                            ~~~~
   include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     511 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     512 |                            NR_VM_NUMA_EVENT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~~
   include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     518 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   include/linux/vmstat.h:524:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     524 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     525 |                            NR_VM_NUMA_EVENT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~~
>> arch/x86/kvm/cpuid.c:1362:3: warning: label followed by a declaration is a C23 extension [-Wc23-extensions]
    1362 |		unsigned int ebx_mask = 0;
         |		^
   5 warnings generated.


vim +1362 arch/x86/kvm/cpuid.c

   940	
   941	static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
   942	{
   943		struct kvm_cpuid_entry2 *entry;
   944		int r, i, max_idx;
   945	
   946		/* all calls to cpuid_count() should be made on the same cpu */
   947		get_cpu();
   948	
   949		r = -E2BIG;
   950	
   951		entry = do_host_cpuid(array, function, 0);
   952		if (!entry)
   953			goto out;
   954	
   955		switch (function) {
   956		case 0:
   957			/* Limited to the highest leaf implemented in KVM. */
   958			entry->eax = min(entry->eax, 0x24U);
   959			break;
   960		case 1:
   961			cpuid_entry_override(entry, CPUID_1_EDX);
   962			cpuid_entry_override(entry, CPUID_1_ECX);
   963			break;
   964		case 2:
   965			/*
   966			 * On ancient CPUs, function 2 entries are STATEFUL.  That is,
   967			 * CPUID(function=2, index=0) may return different results each
   968			 * time, with the least-significant byte in EAX enumerating the
   969			 * number of times software should do CPUID(2, 0).
   970			 *
   971			 * Modern CPUs, i.e. every CPU KVM has *ever* run on are less
   972			 * idiotic.  Intel's SDM states that EAX & 0xff "will always
   973			 * return 01H. Software should ignore this value and not
   974			 * interpret it as an informational descriptor", while AMD's
   975			 * APM states that CPUID(2) is reserved.
   976			 *
   977			 * WARN if a frankenstein CPU that supports virtualization and
   978			 * a stateful CPUID.0x2 is encountered.
   979			 */
   980			WARN_ON_ONCE((entry->eax & 0xff) > 1);
   981			break;
   982		/* functions 4 and 0x8000001d have additional index. */
   983		case 4:
   984		case 0x8000001d:
   985			/*
   986			 * Read entries until the cache type in the previous entry is
   987			 * zero, i.e. indicates an invalid entry.
   988			 */
   989			for (i = 1; entry->eax & 0x1f; ++i) {
   990				entry = do_host_cpuid(array, function, i);
   991				if (!entry)
   992					goto out;
   993			}
   994			break;
   995		case 6: /* Thermal management */
   996			entry->eax = 0x4; /* allow ARAT */
   997			entry->ebx = 0;
   998			entry->ecx = 0;
   999			entry->edx = 0;
  1000			break;
  1001		/* function 7 has additional index. */
  1002		case 7:
  1003			max_idx = entry->eax = min(entry->eax, 2u);
  1004			cpuid_entry_override(entry, CPUID_7_0_EBX);
  1005			cpuid_entry_override(entry, CPUID_7_ECX);
  1006			cpuid_entry_override(entry, CPUID_7_EDX);
  1007	
  1008			/* KVM only supports up to 0x7.2, capped above via min(). */
  1009			if (max_idx >= 1) {
  1010				entry = do_host_cpuid(array, function, 1);
  1011				if (!entry)
  1012					goto out;
  1013	
  1014				cpuid_entry_override(entry, CPUID_7_1_EAX);
  1015				cpuid_entry_override(entry, CPUID_7_1_EDX);
  1016				entry->ebx = 0;
  1017				entry->ecx = 0;
  1018			}
  1019			if (max_idx >= 2) {
  1020				entry = do_host_cpuid(array, function, 2);
  1021				if (!entry)
  1022					goto out;
  1023	
  1024				cpuid_entry_override(entry, CPUID_7_2_EDX);
  1025				entry->ecx = 0;
  1026				entry->ebx = 0;
  1027				entry->eax = 0;
  1028			}
  1029			break;
  1030		case 0xa: { /* Architectural Performance Monitoring */
  1031			union cpuid10_eax eax;
  1032			union cpuid10_edx edx;
  1033	
  1034			if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
  1035				entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1036				break;
  1037			}
  1038	
  1039			eax.split.version_id = kvm_pmu_cap.version;
  1040			eax.split.num_counters = kvm_pmu_cap.num_counters_gp;
  1041			eax.split.bit_width = kvm_pmu_cap.bit_width_gp;
  1042			eax.split.mask_length = kvm_pmu_cap.events_mask_len;
  1043			edx.split.num_counters_fixed = kvm_pmu_cap.num_counters_fixed;
  1044			edx.split.bit_width_fixed = kvm_pmu_cap.bit_width_fixed;
  1045	
  1046			if (kvm_pmu_cap.version)
  1047				edx.split.anythread_deprecated = 1;
  1048			edx.split.reserved1 = 0;
  1049			edx.split.reserved2 = 0;
  1050	
  1051			entry->eax = eax.full;
  1052			entry->ebx = kvm_pmu_cap.events_mask;
  1053			entry->ecx = 0;
  1054			entry->edx = edx.full;
  1055			break;
  1056		}
  1057		case 0x1f:
  1058		case 0xb:
  1059			/*
  1060			 * No topology; a valid topology is indicated by the presence
  1061			 * of subleaf 1.
  1062			 */
  1063			entry->eax = entry->ebx = entry->ecx = 0;
  1064			break;
  1065		case 0xd: {
  1066			u64 permitted_xcr0 = kvm_get_filtered_xcr0();
  1067			u64 permitted_xss = kvm_caps.supported_xss;
  1068	
  1069			entry->eax &= permitted_xcr0;
  1070			entry->ebx = xstate_required_size(permitted_xcr0, false);
  1071			entry->ecx = entry->ebx;
  1072			entry->edx &= permitted_xcr0 >> 32;
  1073			if (!permitted_xcr0)
  1074				break;
  1075	
  1076			entry = do_host_cpuid(array, function, 1);
  1077			if (!entry)
  1078				goto out;
  1079	
  1080			cpuid_entry_override(entry, CPUID_D_1_EAX);
  1081			if (entry->eax & (F(XSAVES)|F(XSAVEC)))
  1082				entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
  1083								  true);
  1084			else {
  1085				WARN_ON_ONCE(permitted_xss != 0);
  1086				entry->ebx = 0;
  1087			}
  1088			entry->ecx &= permitted_xss;
  1089			entry->edx &= permitted_xss >> 32;
  1090	
  1091			for (i = 2; i < 64; ++i) {
  1092				bool s_state;
  1093				if (permitted_xcr0 & BIT_ULL(i))
  1094					s_state = false;
  1095				else if (permitted_xss & BIT_ULL(i))
  1096					s_state = true;
  1097				else
  1098					continue;
  1099	
  1100				entry = do_host_cpuid(array, function, i);
  1101				if (!entry)
  1102					goto out;
  1103	
  1104				/*
  1105				 * The supported check above should have filtered out
  1106				 * invalid sub-leafs.  Only valid sub-leafs should
  1107				 * reach this point, and they should have a non-zero
  1108				 * save state size.  Furthermore, check whether the
  1109				 * processor agrees with permitted_xcr0/permitted_xss
  1110				 * on whether this is an XCR0- or IA32_XSS-managed area.
  1111				 */
  1112				if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
  1113					--array->nent;
  1114					continue;
  1115				}
  1116	
  1117				if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
  1118					entry->ecx &= ~BIT_ULL(2);
  1119				entry->edx = 0;
  1120			}
  1121			break;
  1122		}
  1123		case 0x12:
  1124			/* Intel SGX */
  1125			if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
  1126				entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1127				break;
  1128			}
  1129	
  1130			/*
  1131			 * Index 0: Sub-features, MISCSELECT (a.k.a extended features)
  1132			 * and max enclave sizes.  The SGX sub-features and MISCSELECT
  1133			 * are restricted by kernel and KVM capabilities (like most
  1134			 * feature flags), while enclave size is unrestricted.
  1135			 */
  1136			cpuid_entry_override(entry, CPUID_12_EAX);
  1137			entry->ebx &= SGX_MISC_EXINFO;
  1138	
  1139			entry = do_host_cpuid(array, function, 1);
  1140			if (!entry)
  1141				goto out;
  1142	
  1143			/*
  1144			 * Index 1: SECS.ATTRIBUTES.  ATTRIBUTES are restricted a la
  1145			 * feature flags.  Advertise all supported flags, including
  1146			 * privileged attributes that require explicit opt-in from
  1147			 * userspace.  ATTRIBUTES.XFRM is not adjusted as userspace is
  1148			 * expected to derive it from supported XCR0.
  1149			 */
  1150			entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK;
  1151			entry->ebx &= 0;
  1152			break;
  1153		/* Intel PT */
  1154		case 0x14:
  1155			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) {
  1156				entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1157				break;
  1158			}
  1159	
  1160			for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
  1161				if (!do_host_cpuid(array, function, i))
  1162					goto out;
  1163			}
  1164			break;
  1165		/* Intel AMX TILE */
  1166		case 0x1d:
  1167			if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
  1168				entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1169				break;
  1170			}
  1171	
  1172			for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
  1173				if (!do_host_cpuid(array, function, i))
  1174					goto out;
  1175			}
  1176			break;
  1177		case 0x1e: /* TMUL information */
  1178			if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
  1179				entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1180				break;
  1181			}
  1182			break;
  1183		case 0x24: {
  1184			u8 avx10_version;
  1185	
  1186			if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) {
  1187				entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1188				break;
  1189			}
  1190	
  1191			/*
  1192			 * The AVX10 version is encoded in EBX[7:0].  Note, the version
  1193			 * is guaranteed to be >=1 if AVX10 is supported.  Note #2, the
  1194			 * version needs to be captured before overriding EBX features!
  1195			 */
  1196			avx10_version = min_t(u8, entry->ebx & 0xff, 1);
  1197			cpuid_entry_override(entry, CPUID_24_0_EBX);
  1198			entry->ebx |= avx10_version;
  1199	
  1200			entry->eax = 0;
  1201			entry->ecx = 0;
  1202			entry->edx = 0;
  1203			break;
  1204		}
  1205		case KVM_CPUID_SIGNATURE: {
  1206			const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
  1207			entry->eax = KVM_CPUID_FEATURES;
  1208			entry->ebx = sigptr[0];
  1209			entry->ecx = sigptr[1];
  1210			entry->edx = sigptr[2];
  1211			break;
  1212		}
  1213		case KVM_CPUID_FEATURES:
  1214			entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
  1215				     (1 << KVM_FEATURE_NOP_IO_DELAY) |
  1216				     (1 << KVM_FEATURE_CLOCKSOURCE2) |
  1217				     (1 << KVM_FEATURE_ASYNC_PF) |
  1218				     (1 << KVM_FEATURE_PV_EOI) |
  1219				     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
  1220				     (1 << KVM_FEATURE_PV_UNHALT) |
  1221				     (1 << KVM_FEATURE_PV_TLB_FLUSH) |
  1222				     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
  1223				     (1 << KVM_FEATURE_PV_SEND_IPI) |
  1224				     (1 << KVM_FEATURE_POLL_CONTROL) |
  1225				     (1 << KVM_FEATURE_PV_SCHED_YIELD) |
  1226				     (1 << KVM_FEATURE_ASYNC_PF_INT);
  1227	
  1228			if (sched_info_on())
  1229				entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
  1230	
  1231			entry->ebx = 0;
  1232			entry->ecx = 0;
  1233			entry->edx = 0;
  1234			break;
  1235		case 0x80000000:
  1236			entry->eax = min(entry->eax, 0x80000022);
  1237			/*
  1238			 * Serializing LFENCE is reported in a multitude of ways, and
  1239			 * NullSegClearsBase is not reported in CPUID on Zen2; help
  1240			 * userspace by providing the CPUID leaf ourselves.
  1241			 *
  1242			 * However, only do it if the host has CPUID leaf 0x8000001d.
  1243			 * QEMU thinks that it can query the host blindly for that
  1244			 * CPUID leaf if KVM reports that it supports 0x8000001d or
  1245			 * above.  The processor merrily returns values from the
  1246			 * highest Intel leaf which QEMU tries to use as the guest's
  1247			 * 0x8000001d.  Even worse, this can result in an infinite
  1248			 * loop if said highest leaf has no subleaves indexed by ECX.
  1249			 */
  1250			if (entry->eax >= 0x8000001d &&
  1251			    (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
  1252			     || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
  1253				entry->eax = max(entry->eax, 0x80000021);
  1254			break;
  1255		case 0x80000001:
  1256			entry->ebx &= ~GENMASK(27, 16);
  1257			cpuid_entry_override(entry, CPUID_8000_0001_EDX);
  1258			cpuid_entry_override(entry, CPUID_8000_0001_ECX);
  1259			break;
  1260		case 0x80000005:
  1261			/* Pass host L1 cache and TLB info. */
  1262			break;
  1263		case 0x80000006:
  1264			/* Drop reserved bits, pass host L2 cache and TLB info. */
  1265			entry->edx &= ~GENMASK(17, 16);
  1266			break;
  1267		case 0x80000007: /* Advanced power management */
  1268			cpuid_entry_override(entry, CPUID_8000_0007_EDX);
  1269	
  1270			/* mask against host */
  1271			entry->edx &= boot_cpu_data.x86_power;
  1272			entry->eax = entry->ebx = entry->ecx = 0;
  1273			break;
  1274		case 0x80000008: {
  1275			/*
  1276			 * GuestPhysAddrSize (EAX[23:16]) is intended for software
  1277			 * use.
  1278			 *
  1279			 * KVM's ABI is to report the effective MAXPHYADDR for the
  1280			 * guest in PhysAddrSize (phys_as), and the maximum
  1281			 * *addressable* GPA in GuestPhysAddrSize (g_phys_as).
  1282			 *
  1283			 * GuestPhysAddrSize is valid if and only if TDP is enabled,
  1284			 * in which case the max GPA that can be addressed by KVM may
  1285			 * be less than the max GPA that can be legally generated by
  1286			 * the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't
  1287			 * support 5-level TDP.
  1288			 */
  1289			unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U);
  1290			unsigned int phys_as, g_phys_as;
  1291	
  1292			/*
  1293			 * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as
  1294			 * the guest operates in the same PA space as the host, i.e.
  1295			 * reductions in MAXPHYADDR for memory encryption affect shadow
  1296			 * paging, too.
  1297			 *
  1298			 * If TDP is enabled, use the raw bare metal MAXPHYADDR as
  1299			 * reductions to the HPAs do not affect GPAs.  The max
  1300			 * addressable GPA is the same as the max effective GPA, except
  1301			 * that it's capped at 48 bits if 5-level TDP isn't supported
  1302			 * (hardware processes bits 51:48 only when walking the fifth
  1303			 * level page table).
  1304			 */
  1305			if (!tdp_enabled) {
  1306				phys_as = boot_cpu_data.x86_phys_bits;
  1307				g_phys_as = 0;
  1308			} else {
  1309				phys_as = entry->eax & 0xff;
  1310				g_phys_as = phys_as;
  1311				if (kvm_mmu_get_max_tdp_level() < 5)
  1312					g_phys_as = min(g_phys_as, 48);
  1313			}
  1314	
  1315			entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16);
  1316			entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
  1317			entry->edx = 0;
  1318			cpuid_entry_override(entry, CPUID_8000_0008_EBX);
  1319			break;
  1320		}
  1321		case 0x8000000A:
  1322			if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) {
  1323				entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1324				break;
  1325			}
  1326			entry->eax = 1; /* SVM revision 1 */
  1327			entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
  1328					   ASID emulation to nested SVM */
  1329			entry->ecx = 0; /* Reserved */
  1330			cpuid_entry_override(entry, CPUID_8000_000A_EDX);
  1331			break;
  1332		case 0x80000019:
  1333			entry->ecx = entry->edx = 0;
  1334			break;
  1335		case 0x8000001a:
  1336			entry->eax &= GENMASK(2, 0);
  1337			entry->ebx = entry->ecx = entry->edx = 0;
  1338			break;
  1339		case 0x8000001e:
  1340			/* Do not return host topology information. */
  1341			entry->eax = entry->ebx = entry->ecx = 0;
  1342			entry->edx = 0; /* reserved */
  1343			break;
  1344		case 0x8000001F:
  1345			if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
  1346				entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1347			} else {
  1348				cpuid_entry_override(entry, CPUID_8000_001F_EAX);
  1349				/* Clear NumVMPL since KVM does not support VMPL. */
  1350				entry->ebx &= ~GENMASK(31, 12);
  1351				/*
  1352				 * Enumerate '0' for "PA bits reduction", the adjusted
  1353				 * MAXPHYADDR is enumerated directly (see 0x80000008).
  1354				 */
  1355				entry->ebx &= ~GENMASK(11, 6);
  1356			}
  1357			break;
  1358		case 0x80000020:
  1359			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1360			break;
  1361		case 0x80000021:
> 1362			unsigned int ebx_mask = 0;
  1363	
  1364			entry->ecx = entry->edx = 0;
  1365			cpuid_entry_override(entry, CPUID_8000_0021_EAX);
  1366	
  1367			/*
  1368			 * Bits 23:16 in EBX indicate the size of the RSB.
  1369			 * Expose the value in the hardware to the guest.
  1370			 */
  1371			if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
  1372				ebx_mask |= GENMASK(23, 16);
  1373	
  1374			entry->ebx &= ebx_mask;
  1375			break;
  1376		/* AMD Extended Performance Monitoring and Debug */
  1377		case 0x80000022: {
  1378			union cpuid_0x80000022_ebx ebx;
  1379	
  1380			entry->ecx = entry->edx = 0;
  1381			if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
  1382				entry->eax = entry->ebx;
  1383				break;
  1384			}
  1385	
  1386			cpuid_entry_override(entry, CPUID_8000_0022_EAX);
  1387	
  1388			if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
  1389				ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp;
  1390			else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
  1391				ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE;
  1392			else
  1393				ebx.split.num_core_pmc = AMD64_NUM_COUNTERS;
  1394	
  1395			entry->ebx = ebx.full;
  1396			break;
  1397		}
  1398		/*Add support for Centaur's CPUID instruction*/
  1399		case 0xC0000000:
  1400			/*Just support up to 0xC0000004 now*/
  1401			entry->eax = min(entry->eax, 0xC0000004);
  1402			break;
  1403		case 0xC0000001:
  1404			cpuid_entry_override(entry, CPUID_C000_0001_EDX);
  1405			break;
  1406		case 3: /* Processor serial number */
  1407		case 5: /* MONITOR/MWAIT */
  1408		case 0xC0000002:
  1409		case 0xC0000003:
  1410		case 0xC0000004:
  1411		default:
  1412			entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
  1413			break;
  1414		}
  1415	
  1416		r = 0;
  1417	
  1418	out:
  1419		put_cpu();
  1420	
  1421		return r;
  1422	}
  1423	
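For context on the flagged warning: before C23, a case label cannot directly precede a declaration; it must label a statement, so the declaration needs a compound statement around it. A minimal, hypothetical illustration (not from the kernel tree):

	void demo(int function)
	{
		switch (function) {
		case 1:
			int bad = 0;	/* warning: label followed by a declaration is a C23 extension */
			bad++;
			break;
		case 2: {
			int good = 0;	/* OK in all C versions: braces form a compound statement */
			good++;
			break;
		}
		}
	}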
On Fri, 2024-11-01 at 12:14 +0800, kernel test robot wrote:
> Hi Amit,
> 
> kernel test robot noticed the following build warnings:

[...]

> >> arch/x86/kvm/cpuid.c:1362:3: warning: label followed by a
> >> declaration is a C23 extension [-Wc23-extensions]
>     1362 |		unsigned int ebx_mask = 0;
>          |		^
>    5 warnings generated.

[...]

>   1361		case 0x80000021:
> > 1362			unsigned int ebx_mask = 0;
>   1363	
>   1364			entry->ecx = entry->edx = 0;
>   1365			cpuid_entry_override(entry, CPUID_8000_0021_EAX);
>   1366	
>   1367			/*
>   1368			 * Bits 23:16 in EBX indicate the size of
>   1369			 * the RSB. Expose the value in the hardware
>   1370			 * to the guest.
>   1371			 */
>   1372			if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
>   1373				ebx_mask |= GENMASK(23, 16);
>   1374	
>   1375			entry->ebx &= ebx_mask;
>   1376			break;

Right - I'll add braces around this case statement.

		Amit
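A sketch of what the braced version would look like, following the compound-statement style of the neighbouring cases (e.g. 0x80000022), which silences -Wc23-extensions:

	case 0x80000021: {
		unsigned int ebx_mask = 0;

		entry->ecx = entry->edx = 0;
		cpuid_entry_override(entry, CPUID_8000_0021_EAX);

		/*
		 * Bits 23:16 in EBX indicate the size of the RSB.
		 * Expose the value in the hardware to the guest.
		 */
		if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
			ebx_mask |= GENMASK(23, 16);

		entry->ebx &= ebx_mask;
		break;
	}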
On Thu, Oct 31, 2024 at 04:39:25PM +0100, Amit Shah wrote:
> +	if (boot_cpu_has(X86_FEATURE_ERAPS) && npt_enabled)
s/boot_cpu_has/cpu_feature_enabled/g
On Mon, 2024-11-04 at 06:18 +0100, Borislav Petkov wrote:
> On Thu, Oct 31, 2024 at 04:39:25PM +0100, Amit Shah wrote:
> > +	if (boot_cpu_has(X86_FEATURE_ERAPS) && npt_enabled)
> 
> s/boot_cpu_has/cpu_feature_enabled/g

ACK
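That is, the init_vmcb() check would become (a sketch of the suggested substitution; cpu_feature_enabled() additionally folds in the compile-time disabled-features masks):

	if (cpu_feature_enabled(X86_FEATURE_ERAPS) && npt_enabled)
		vmcb_set_larger_rap(svm->vmcb);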
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 2b59b9951c90..f8584a63c859 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -129,7 +129,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u64 tsc_offset;
 	u32 asid;
 	u8 tlb_ctl;
-	u8 reserved_2[3];
+	u8 erap_ctl;
+	u8 reserved_2[2];
 	u32 int_ctl;
 	u32 int_vector;
 	u32 int_state;
@@ -175,6 +176,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define TLB_CONTROL_FLUSH_ASID 3
 #define TLB_CONTROL_FLUSH_ASID_LOCAL 7
 
+#define ERAP_CONTROL_ALLOW_LARGER_RAP 0
+#define ERAP_CONTROL_FLUSH_RAP 1
+
 #define V_TPR_MASK 0x0f
 
 #define V_IRQ_SHIFT 8
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 41786b834b16..2c2a60964a2e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -797,6 +797,8 @@ void kvm_set_cpu_caps(void)
 		F(WRMSR_XX_BASE_NS)
 	);
 
+	if (tdp_enabled)
+		kvm_cpu_cap_check_and_set(X86_FEATURE_ERAPS);
 	kvm_cpu_cap_check_and_set(X86_FEATURE_SBPB);
 	kvm_cpu_cap_check_and_set(X86_FEATURE_IBPB_BRTYPE);
 	kvm_cpu_cap_check_and_set(X86_FEATURE_SRSO_NO);
@@ -1357,8 +1359,19 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 		break;
 	case 0x80000021:
-		entry->ebx = entry->ecx = entry->edx = 0;
+		unsigned int ebx_mask = 0;
+
+		entry->ecx = entry->edx = 0;
 		cpuid_entry_override(entry, CPUID_8000_0021_EAX);
+
+		/*
+		 * Bits 23:16 in EBX indicate the size of the RSB.
+		 * Expose the value in the hardware to the guest.
+		 */
+		if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
+			ebx_mask |= GENMASK(23, 16);
+
+		entry->ebx &= ebx_mask;
 		break;
 	/* AMD Extended Performance Monitoring and Debug */
 	case 0x80000022: {
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9df3e1e5ae81..ecd290ff38f8 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1360,6 +1360,28 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
 		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
 
+	/*
+	 * If the hardware has a larger RSB, use it in the guest context as
+	 * well.
+	 *
+	 * When running nested guests: the hardware tags host and guest RSB
+	 * entries, but the entries are ASID agnostic.  Differentiating L1 and
+	 * L2 guests isn't possible in hardware.  To prevent L2->L1 RSB
+	 * poisoning attacks in this case, the L0 hypervisor must set
+	 * FLUSH_RAP_ON_VMRUN in the L1's VMCB on a nested #VMEXIT to ensure
+	 * the next VMRUN flushes the RSB.
+	 *
+	 * For shadow paging / NPT disabled case: the CPU's CR3 does not
+	 * contain the CR3 of the running guest process, and hence intra-guest
+	 * context switches will not cause a hardware TLB flush, which in turn
+	 * does not result in a guest RSB flush that the ERAPS feature
+	 * provides.  Do not expose ERAPS or the larger RSB to the guest in
+	 * this case, so the guest continues implementing software mitigations
+	 * as well as only sees 32 entries for the RSB.
+	 */
+	if (boot_cpu_has(X86_FEATURE_ERAPS) && npt_enabled)
+		vmcb_set_larger_rap(svm->vmcb);
+
 	if (kvm_vcpu_apicv_active(vcpu))
 		avic_init_vmcb(svm, vmcb);
 
@@ -3393,6 +3415,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
 	pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
 	pr_err("%-20s%d\n", "asid:", control->asid);
 	pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
+	pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
 	pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
 	pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
 	pr_err("%-20s%08x\n", "int_state:", control->int_state);
@@ -3559,6 +3582,27 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
 
 	trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
 
+	if (boot_cpu_has(X86_FEATURE_ERAPS)
+	    && vmcb_is_larger_rap(svm->vmcb01.ptr)) {
+		/*
+		 * XXX a few further optimizations can be made:
+		 *
+		 * 1. In pre_svm_run() we can reset this bit when a hw
+		 * TLB flush has happened - any context switch on a
+		 * CPU (which causes a TLB flush) auto-flushes the RSB
+		 * - eg when this vCPU is scheduled on a different
+		 * pCPU.
+		 *
+		 * 2. This is also not needed in the case where the
+		 * vCPU is being scheduled on the same pCPU, but there
+		 * was a context switch between the #VMEXIT and VMRUN.
+		 *
+		 * 3. If the guest returns to L2 again after this
+		 * #VMEXIT, there's no need to flush the RSB.
+		 */
+		vmcb_set_flush_rap(svm->vmcb01.ptr);
+	}
+
 	vmexit = nested_svm_exit_special(svm);
 
 	if (vmexit == NESTED_EXIT_CONTINUE)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 43fa6a16eb19..8a7877f46dc5 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -500,6 +500,21 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
 	return vmcb_is_intercept(&svm->vmcb->control, bit);
 }
 
+static inline void vmcb_set_flush_rap(struct vmcb *vmcb)
+{
+	__set_bit(ERAP_CONTROL_FLUSH_RAP, (unsigned long *)&vmcb->control.erap_ctl);
+}
+
+static inline void vmcb_set_larger_rap(struct vmcb *vmcb)
+{
+	__set_bit(ERAP_CONTROL_ALLOW_LARGER_RAP, (unsigned long *)&vmcb->control.erap_ctl);
+}
+
+static inline bool vmcb_is_larger_rap(struct vmcb *vmcb)
+{
+	return test_bit(ERAP_CONTROL_ALLOW_LARGER_RAP, (unsigned long *)&vmcb->control.erap_ctl);
+}
+
 static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
 {
 	return guest_can_use(&svm->vcpu, X86_FEATURE_VGIF) &&
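For illustration only (not part of the patch): a hypothetical guest-side sketch of how the RSB size advertised in CPUID Fn8000_0021 EBX[23:16] could be queried, using GCC/clang's <cpuid.h>. Per the patch, a guest without ERAPS exposed sees this field masked to zero and continues to assume the default 32-entry RSB.

	/* Hypothetical guest-side probe; not part of the patch. */
	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* CPUID Fn8000_0021; with ERAPS, EBX[23:16] reports the RSB size. */
		if (!__get_cpuid(0x80000021, &eax, &ebx, &ecx, &edx))
			return 1;	/* leaf not supported */

		unsigned int rsb_size = (ebx >> 16) & 0xff;

		/* Zero means ERAPS is not advertised, i.e. the default 32-entry RSB. */
		printf("RSB size from CPUID: %u entries\n", rsb_size);
		return 0;
	}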