Created
August 24, 2021 05:26
-
-
Save al3xtjames/e7ffa911e1653dcc49a163a113fa970b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c | |
index 8d4ebe095..586279e41 100644 | |
--- a/drivers/pci/pci.c | |
+++ b/drivers/pci/pci.c | |
@@ -1066,6 +1066,13 @@ static int pci_raw_set_power_state(struct pci_dev *dev, pci_power_t state) | |
|| (state == PCI_D2 && !dev->d2_support)) | |
return -EIO; | |
+ /* | |
+ * Check if we have a bad combination of bridge controller and nvidia | |
+ * GPU, see quirk_broken_nv_runpm for more info | |
+ */ | |
+ if (state != PCI_D0 && dev->broken_nv_runpm) | |
+ return 0; | |
+ | |
pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); | |
if (pmcsr == (u16) ~0) { | |
pci_err(dev, "can't change power state from %s to %s (config space inaccessible)\n", | |
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c | |
index 6d74386ea..d301cd0c9 100644 | |
--- a/drivers/pci/quirks.c | |
+++ b/drivers/pci/quirks.c | |
@@ -5664,6 +5664,57 @@ DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_NVIDIA, 0x13b1, | |
PCI_CLASS_DISPLAY_VGA, 8, | |
quirk_reset_lenovo_thinkpad_p50_nvgpu); | |
+/* | |
+ * Some Intel PCIe bridge controllers cause devices to not reappear during a | |
+ * D0 -> D3hot -> D3cold -> D0 sequence. Skipping the intermediate D3hot step | |
+ * seems to make it work again. | |
+ * | |
+ * This leads to various manifestations of this issue: | |
+ * - AML code execution hits an infinite loop (as the code waits on device | |
+ * memory to change). | |
+ * - kernel crashes, as all PCI reads return -1, which most code isn't able | |
+ * to handle well enough. | |
+ * - sudden shutdowns, as the kernel identified an unrecoverable error after | |
+ * userspace tries to access the GPU. | |
+ * | |
+ * In all cases dmesg will contain at least one line like this: | |
+ * 'nouveau 0000:01:00.0: Refused to change power state, currently in D3' | |
+ * followed by a lot of nouveau timeouts. | |
+ * | |
+ * ACPI code writes bit 0x80 to the undocumented PCI register 0x248 of the | |
+ * Intel PCIe bridge controller (0x1901) in order to power down the GPU. | |
+ * Nonetheless, there are other code paths inside the ACPI firmware which use | |
+ * other registers, which seem to work fine: | |
+ * - 0xbc bit 0x20 (publicly available documentation claims 'reserved') | |
+ * - 0xb0 bit 0x10 (link disable) | |
+ * Changing the conditions inside the firmware by poking into the relevant | |
+ * addresses does resolve the issue, but it seemed to be ACPI private memory | |
+ * and not any device accessible memory at all, so there is no portable way of | |
+ * changing the conditions. | |
+ * | |
+ * The only systems where this behavior can be seen are hybrid graphics laptops | |
+ * with a secondary Nvidia Maxwell, Pascal or Turing GPU. It is unclear whether | |
+ * this issue only occurs in combination with the listed Intel PCIe bridge | |
+ * controllers and the mentioned GPUs, or whether it is purely a hw bug in the | |
+ * bridge controller. | |
+ */ | |
+ | |
+static void quirk_broken_nv_runpm(struct pci_dev *dev) | |
+{ | |
+ struct pci_dev *bridge = pci_upstream_bridge(dev); | |
+ | |
+ if (!bridge || bridge->vendor != PCI_VENDOR_ID_INTEL) | |
+ return; | |
+ | |
+ switch (bridge->device) { | |
+ case 0x1901: | |
+ dev->broken_nv_runpm = 1; | |
+ } | |
+} | |
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, | |
+ PCI_BASE_CLASS_DISPLAY, 16, | |
+ quirk_broken_nv_runpm); | |
+ | |
/* | |
* Device [1b21:2142] | |
* When in D0, PME# doesn't get asserted when plugging USB 3.0 device. | |
diff --git a/include/linux/pci.h b/include/linux/pci.h | |
index 243065042..83e56b08f 100644 | |
--- a/include/linux/pci.h | |
+++ b/include/linux/pci.h | |
@@ -446,6 +446,7 @@ struct pci_dev { | |
*/ | |
unsigned int external_facing:1; | |
unsigned int broken_intx_masking:1; /* INTx masking can't be used */ | |
+ unsigned int broken_nv_runpm:1; /* some combinations of intel bridge controller and nvidia GPUs break rtd3 */ | |
unsigned int io_window_1k:1; /* Intel bridge 1K I/O windows */ | |
unsigned int irq_managed:1; | |
unsigned int non_compliant_bars:1; /* Broken BARs; ignore them */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment