From 7e978866be3ff079d72c1ac400fba91d0be90d34 Mon Sep 17 00:00:00 2001
From: Yang Bo
Date: Wed, 28 May 2025 11:18:19 +0800
Subject: [PATCH 01/11] drm/ttm: Use dmacoherent allocations on ARM64

Various PCIe controllers on ARM64 platforms don't support cache
snooping, which leads to numerous issues when attempting to use PCIe
graphics cards.

Switching ttm_prot_from_caching to return pgprot_dmacoherent for
ttm_cached pages solves the issue, albeit with a performance hit.

There is a second check, in ttm_bo_kmap_ttm, that short-circuits
ttm_prot_from_caching for single cached pages; it needs updating as
well.

Signed-off-by: Yang Bo
Signed-off-by: Dave Stevenson
---
 drivers/gpu/drm/ttm/ttm_bo_util.c | 3 ++-
 drivers/gpu/drm/ttm/ttm_module.c  | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index acbbca9d5c92f0..7c694b887df652 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -359,7 +359,8 @@ static int ttm_bo_kmap_ttm(struct ttm_buffer_object *bo,
 	if (ret)
 		return ret;
 
-	if (num_pages == 1 && ttm->caching == ttm_cached &&
+	if (!IS_ENABLED(CONFIG_ARM64) &&
+	    num_pages == 1 && ttm->caching == ttm_cached &&
 	    !(man->use_tt && (ttm->page_flags & TTM_TT_FLAG_DECRYPTED))) {
 		/*
 		 * We're mapping a single page, and the desired
diff --git a/drivers/gpu/drm/ttm/ttm_module.c b/drivers/gpu/drm/ttm/ttm_module.c
index b3fffe7b5062a9..6966495bdd499b 100644
--- a/drivers/gpu/drm/ttm/ttm_module.c
+++ b/drivers/gpu/drm/ttm/ttm_module.c
@@ -63,7 +63,11 @@ pgprot_t ttm_prot_from_caching(enum ttm_caching caching, pgprot_t tmp)
 {
 	/* Cached mappings need no adjustment */
 	if (caching == ttm_cached)
+#ifdef CONFIG_ARM64
+		return pgprot_dmacoherent(tmp);
+#else
 		return tmp;
+#endif
 
 #if defined(__i386__) || defined(__x86_64__)
 	if (caching == ttm_write_combined)

From 21c2e158e8e6452483f1a2fbe780b182cd28feaf Mon Sep 17 00:00:00 2001
From: Dave Stevenson
Date: Tue, 30 Sep 2025 15:02:52 +0100
Subject: [PATCH 02/11] defconfig: Add amdgpu and radeon drivers to 2711/2712
 defconfig

Also includes SND_HDA_* modules for audio on AMD GPUs.
Signed-off-by: Dave Stevenson --- arch/arm64/configs/bcm2711_defconfig | 7 +++++++ arch/arm64/configs/bcm2712_defconfig | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/arch/arm64/configs/bcm2711_defconfig b/arch/arm64/configs/bcm2711_defconfig index f7081ebff6bd2f..059aeae24df7c3 100644 --- a/arch/arm64/configs/bcm2711_defconfig +++ b/arch/arm64/configs/bcm2711_defconfig @@ -1070,6 +1070,10 @@ CONFIG_AUXDISPLAY=y CONFIG_HD44780=m CONFIG_DRM=m CONFIG_DRM_LOAD_EDID_FIRMWARE=y +CONFIG_DRM_RADEON=m +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y CONFIG_DRM_UDL=m CONFIG_DRM_PANEL_LVDS=m CONFIG_DRM_PANEL_ILITEK_IL79600A=m @@ -1133,6 +1137,9 @@ CONFIG_SND_SERIAL_U16550=m CONFIG_SND_MPU401=m CONFIG_SND_PIMIDI=m CONFIG_SND_PISOUND_MICRO=m +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_CODEC_HDMI=m CONFIG_SND_USB_AUDIO=m CONFIG_SND_USB_UA101=m CONFIG_SND_USB_CAIAQ=m diff --git a/arch/arm64/configs/bcm2712_defconfig b/arch/arm64/configs/bcm2712_defconfig index f873143ea5ffb5..49c5d679fe8cb7 100644 --- a/arch/arm64/configs/bcm2712_defconfig +++ b/arch/arm64/configs/bcm2712_defconfig @@ -1072,6 +1072,10 @@ CONFIG_AUXDISPLAY=y CONFIG_HD44780=m CONFIG_DRM=m CONFIG_DRM_LOAD_EDID_FIRMWARE=y +CONFIG_DRM_RADEON=m +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y CONFIG_DRM_UDL=m CONFIG_DRM_PANEL_LVDS=m CONFIG_DRM_PANEL_ILITEK_IL79600A=m @@ -1135,6 +1139,9 @@ CONFIG_SND_SERIAL_U16550=m CONFIG_SND_MPU401=m CONFIG_SND_PIMIDI=m CONFIG_SND_PISOUND_MICRO=m +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_CODEC_HDMI=m CONFIG_SND_USB_AUDIO=m CONFIG_SND_USB_UA101=m CONFIG_SND_USB_CAIAQ=m From 8b48625dc3cdf1cfe30cfad7f28356f7444741cb Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Wed, 1 Oct 2025 15:48:14 +0100 Subject: [PATCH 03/11] drm/i915: Only reset console if enabled Taken from https://github.com/chimera-linux/cports/blob/master/main/linux-stable/patches/xe-nonx86.patch Signed-off-by: Dave Stevenson --- drivers/gpu/drm/i915/display/intel_vga.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_vga.c b/drivers/gpu/drm/i915/display/intel_vga.c index 6e125564db34c9..31d3f1f504d26f 100644 --- a/drivers/gpu/drm/i915/display/intel_vga.c +++ b/drivers/gpu/drm/i915/display/intel_vga.c @@ -78,6 +78,7 @@ void intel_vga_disable(struct intel_display *display) void intel_vga_reset_io_mem(struct intel_display *display) { +#if defined(CONFIG_VGA_CONSOLE) struct pci_dev *pdev = to_pci_dev(display->drm->dev); /* @@ -93,6 +94,7 @@ void intel_vga_reset_io_mem(struct intel_display *display) vga_get_uninterruptible(pdev, VGA_RSRC_LEGACY_IO); outb(inb(VGA_MIS_R), VGA_MIS_W); vga_put(pdev, VGA_RSRC_LEGACY_IO); +#endif } int intel_vga_register(struct intel_display *display) From c57b796a5f35356837f0895609d45370f883309a Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Wed, 1 Oct 2025 18:42:42 +0100 Subject: [PATCH 04/11] drm/i915: Another legacy path to be nuked Signed-off-by: Dave Stevenson --- drivers/gpu/drm/i915/display/intel_display.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index fac9f65a6948ff..fadae4e410f536 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -7657,6 +7657,7 @@ int intel_atomic_commit(struct drm_device *dev, struct drm_atomic_state *_state, * FIXME: Cut over to (async) commit helpers instead of hand-rolling 
* everything. */ +#if defined(CONFIG_VGA_CONSOLE) if (state->base.legacy_cursor_update) { struct intel_crtc_state *new_crtc_state; struct intel_crtc *crtc; @@ -7665,7 +7666,7 @@ int intel_atomic_commit(struct drm_device *dev, struct drm_atomic_state *_state, for_each_new_intel_crtc_in_state(state, crtc, new_crtc_state, i) complete_all(&new_crtc_state->uapi.commit->flip_done); } - +#endif ret = intel_atomic_prepare_commit(state); if (ret) { drm_dbg_atomic(display->drm, From c56acfbef67841a7ed0b8b18327bfb2920f181c2 Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Wed, 22 Oct 2025 15:04:41 +0100 Subject: [PATCH 05/11] defconfigs: Add Intel XE graphics driver to 2711 and 2712 defconfigs. Signed-off-by: Dave Stevenson --- arch/arm64/configs/bcm2711_defconfig | 2 ++ arch/arm64/configs/bcm2712_defconfig | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/configs/bcm2711_defconfig b/arch/arm64/configs/bcm2711_defconfig index 059aeae24df7c3..5aa6993b269928 100644 --- a/arch/arm64/configs/bcm2711_defconfig +++ b/arch/arm64/configs/bcm2711_defconfig @@ -1074,6 +1074,8 @@ CONFIG_DRM_RADEON=m CONFIG_DRM_AMDGPU=m CONFIG_DRM_AMDGPU_SI=y CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_XE=m +CONFIG_DRM_XE_FORCE_PROBE="*" CONFIG_DRM_UDL=m CONFIG_DRM_PANEL_LVDS=m CONFIG_DRM_PANEL_ILITEK_IL79600A=m diff --git a/arch/arm64/configs/bcm2712_defconfig b/arch/arm64/configs/bcm2712_defconfig index 49c5d679fe8cb7..3e9ddac8a700f9 100644 --- a/arch/arm64/configs/bcm2712_defconfig +++ b/arch/arm64/configs/bcm2712_defconfig @@ -1076,6 +1076,8 @@ CONFIG_DRM_RADEON=m CONFIG_DRM_AMDGPU=m CONFIG_DRM_AMDGPU_SI=y CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_XE=m +CONFIG_DRM_XE_FORCE_PROBE="*" CONFIG_DRM_UDL=m CONFIG_DRM_PANEL_LVDS=m CONFIG_DRM_PANEL_ILITEK_IL79600A=m From ca4b8f3c13cd9bded18043bbc634cc94513b04d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20B=C4=83l=C4=83nic=C4=83?= Date: Wed, 8 Oct 2025 10:45:17 -0400 Subject: [PATCH 06/11] arm64: mm: Handle alignment faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow kernel/user space code to perform unaligned accesses to memory regions that do not normally support them (e.g. device mappings) by trapping alignment faults on common load/store instructions and breaking up the offending accesses into naturally aligned ones. Signed-off-by: Mario Bălănică --- arch/arm64/Kconfig | 9 + arch/arm64/include/asm/exception.h | 1 + arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/alignment.c | 1031 ++++++++++++++++++++++++++++ arch/arm64/mm/fault.c | 6 +- 5 files changed, 1046 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/kernel/alignment.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 61007663ab3f7b..d07efcdb78a854 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1696,6 +1696,15 @@ config ARM64_TAGGED_ADDR_ABI to system calls as pointer arguments. For details, see Documentation/arch/arm64/tagged-address-abi.rst. +config ARM64_ALIGNMENT_FIXUPS + bool "Fix up misaligned multi-word loads and stores in 64-bit kernel/user space" + default y + help + This option enables kernel/user space code to perform unaligned accesses + to memory regions that do not normally support them (e.g. device mappings) + by trapping alignment faults on common load/store instructions and breaking + up the offending accesses into properly aligned ones. 
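
The mechanism is easiest to see on the store side: the handler decodes the
faulting load/store, then replays it with smaller, naturally aligned accesses
that Device memory can accept. A minimal standalone sketch of the replay idea
(illustrative only; the actual handler added below goes through put_user() and
memcpy_toio()):

#include <stdint.h>

/* Replay an unaligned store as byte stores; every byte access is
 * naturally aligned, so a Device-nGnRE mapping accepts it. */
static void replay_unaligned_store(volatile uint8_t *addr, uint64_t val, int size)
{
	int i;

	for (i = 0; i < size; i++)
		addr[i] = (uint8_t)(val >> (8 * i));	/* little-endian byte order */
}
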
+
 menuconfig COMPAT
 	bool "Kernel support for 32-bit EL0"
 	depends on ARM64_4K_PAGES || EXPERT
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index a2da3cb21c244a..9113ed1150ac12 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -82,6 +82,7 @@ void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
 void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr);
 void do_el0_cp15(unsigned long esr, struct pt_regs *regs);
 int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs);
+int do_alignment_fixup(unsigned long addr, unsigned int esr, struct pt_regs *regs);
 void do_el0_svc(struct pt_regs *regs);
 void do_el0_svc_compat(struct pt_regs *regs);
 void do_el0_fpac(struct pt_regs *regs, unsigned long esr);
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 76f32e424065e5..3329d008a6de95 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -36,6 +36,7 @@ obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o	\
 			   syscall.o proton-pack.o idle.o patching.o pi/	\
 			   rsi.o jump_label.o
+obj-$(CONFIG_ARM64_ALIGNMENT_FIXUPS)	+= alignment.o
 obj-$(CONFIG_COMPAT)			+= sys32.o signal32.o			\
 					   sys_compat.o
 obj-$(CONFIG_COMPAT)			+= sigreturn32.o
diff --git a/arch/arm64/kernel/alignment.c b/arch/arm64/kernel/alignment.c
new file mode 100644
index 00000000000000..0b737102bd20b3
--- /dev/null
+++ b/arch/arm64/kernel/alignment.c
@@ -0,0 +1,1031 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Ampere Computing LLC
+ * Copyright (C) 2025 Mario Bălănică
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/io.h>
+#include <linux/perf_event.h>
+#include <linux/uaccess.h>
+#include <linux/version.h>
+
+#include <asm/cputype.h>
+#include <asm/exception.h>
+#include <asm/insn.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 13, 0)
+#include <asm/text-patching.h>
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+#include <asm/patching.h>
+#endif
+#include <asm/neon.h>
+#include <asm/traps.h>
+
+static __always_inline int __aarch64_insn_is_class_ldst(u32 insn)
+{
+	return (insn & 0x0A000000) == 0x08000000;
+}
+
+static __always_inline int __aarch64_insn_is_dc_zva(u32 insn)
+{
+	return (insn & 0xFFFFFFE0) == 0xD50B7420;
+}
+
+static int copy_from_user_io(void *to, const void __user *from, unsigned long n)
+{
+	const u8 __user *src = from;
+	u8 *dest = to;
+
+	for (; n; n--)
+		if (get_user(*dest++, src++))
+			break;
+	return n;
+}
+
+static int copy_to_user_io(void __user *to, const void *from, unsigned long n)
+{
+	const u8 *src = from;
+	u8 __user *dest = to;
+
+	for (; n; n--)
+		if (put_user(*src++, dest++))
+			break;
+	return n;
+}
+
+static int align_load(unsigned long addr, int sz, u64 *out)
+{
+	union {
+		u8 d8;
+		u16 d16;
+		u32 d32;
+		u64 d64;
+		char c[8];
+	} data;
+
+	if (sz != 1 && sz != 2 && sz != 4 && sz != 8)
+		return 1;
+	if (is_ttbr0_addr(addr)) {
+		if (copy_from_user_io(data.c, (const void __user *)addr, sz))
+			return 1;
+	} else
+		memcpy_fromio(data.c, (const void __iomem *)addr, sz);
+	switch (sz) {
+	case 1:
+		*out = data.d8;
+		break;
+	case 2:
+		*out = data.d16;
+		break;
+	case 4:
+		*out = data.d32;
+		break;
+	case 8:
+		*out = data.d64;
+		break;
+	default:
+		return 1;
+	}
+	return 0;
+}
+
+static int align_store(unsigned long addr, int sz, u64 val)
+{
+	union {
+		u8 d8;
+		u16 d16;
+		u32 d32;
+		u64 d64;
+		char c[8];
+	} data;
+
+	switch (sz) {
+	case 1:
+		data.d8 = val;
+		break;
+	case 2:
+		data.d16 = val;
+		break;
+	case 4:
+		data.d32 = val;
+		break;
+	case 8:
+		data.d64 = val;
+		break;
+	default:
+		return 1;
+	}
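+	/* User (TTBR0) addresses go through put_user() a byte at a time;
+	 * kernel addresses are treated as MMIO and use memcpy_toio(). */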
+	if (is_ttbr0_addr(addr)) {
+		if (copy_to_user_io((void __user *)addr, data.c, sz))
+			return 1;
+	} else
+		memcpy_toio((void __iomem *)addr, data.c, sz);
+	return 0;
+}
+
+static int align_dc_zva(unsigned long addr, struct pt_regs *regs)
+{
+	int bs = read_cpuid(DCZID_EL0) & 0xf;
+	int sz = 1 << (bs + 2);
+
+	addr &= ~(sz - 1);
+	if (is_ttbr0_addr(addr)) {
+		for (; sz; sz--) {
+			if (align_store(addr++, 1, 0))
+				return 1;
+		}
+	} else
+		memset_io((void __iomem *)addr, 0, sz);
+	return 0;
+}
+
+static __always_inline u64 get_vn_dt(int n, int t)
+{
+	return ((u64 *)&current->thread.uw.fpsimd_state.vregs[n])[t];
+}
+
+static __always_inline void set_vn_dt(int n, int t, u64 val)
+{
+	((u64 *)&current->thread.uw.fpsimd_state.vregs[n])[t] = val;
+}
+
+static __always_inline int kernel_neon_wrapper(int (*fn)(u32 insn, struct pt_regs *regs),
+					       u32 insn, struct pt_regs *regs)
+{
+	int ret;
+
+	kernel_neon_begin();
+	ret = fn(insn, regs);
+	kernel_neon_end();
+
+	return ret;
+}
+
+static u64 replicate64(u64 val, int bits)
+{
+	switch (bits) {
+	case 8:
+		val = (val << 8) | (val & 0xff);
+		fallthrough;
+	case 16:
+		val = (val << 16) | (val & 0xffff);
+		fallthrough;
+	case 32:
+		val = (val << 32) | (val & 0xffffffff);
+		break;
+	default:
+		break;
+	}
+	return val;
+}
+
+static u64 elem_get(u64 hi, u64 lo, int index, int esize)
+{
+	int shift = index * esize;
+	u64 mask = GENMASK(esize - 1, 0);
+
+	if (shift < 64)
+		return (lo >> shift) & mask;
+	else
+		return (hi >> (shift - 64)) & mask;
+}
+
+static void elem_set(u64 *hi, u64 *lo, int index, int esize, u64 val)
+{
+	int shift = index * esize;
+	u64 mask = GENMASK(esize - 1, 0);
+
+	if (shift < 64)
+		*lo = (*lo & ~(mask << shift)) | ((val & mask) << shift);
+	else
+		*hi = (*hi & ~(mask << (shift - 64))) | ((val & mask) << (shift - 64));
+}
+
+static int align_ldst_pair(u32 insn, struct pt_regs *regs)
+{
+	const u32 OPC = GENMASK(31, 30);
+	const u32 L_MASK = BIT(22);
+
+	int opc = FIELD_GET(OPC, insn);
+	int L = FIELD_GET(L_MASK, insn);
+
+	bool wback = !!(insn & BIT(23));
+	bool postindex = !(insn & BIT(24));
+
+	int n = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn);
+	int t = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn);
+	int t2 = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT2, insn);
+	bool is_store = !L;
+	bool is_signed = !!(opc & 1);
+	int scale = 2 + (opc >> 1);
+	int datasize = 8 << scale;
+	u64 uoffset = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_7, insn);
+	s64 offset = sign_extend64(uoffset, 6) << scale;
+	u64 address;
+	u64 data1, data2;
+	u64 dbytes;
+
+	if ((is_store && (opc & 1)) || opc == 3)
+		return 1;
+
+	if (wback && (t == n || t2 == n) && n != 31)
+		return 1;
+
+	if (!is_store && t == t2)
+		return 1;
+
+	dbytes = datasize / 8;
+
+	address = regs_get_register(regs, n << 3);
+
+	if (!postindex)
+		address += offset;
+
+	if (is_store) {
+		data1 = pt_regs_read_reg(regs, t);
+		data2 = pt_regs_read_reg(regs, t2);
+		if (align_store(address, dbytes, data1) ||
+		    align_store(address + dbytes, dbytes, data2))
+			return 1;
+	} else {
+		if (align_load(address, dbytes, &data1) ||
+		    align_load(address + dbytes, dbytes, &data2))
+			return 1;
+		if (is_signed) {
+			data1 = sign_extend64(data1, datasize - 1);
+			data2 = sign_extend64(data2, datasize - 1);
+		}
+		pt_regs_write_reg(regs, t, data1);
+		pt_regs_write_reg(regs, t2, data2);
+	}
+
+	if (wback) {
+		if (postindex)
+			address += offset;
+		if (n == 31)
+			regs->sp = address;
+		else
+			pt_regs_write_reg(regs, n, address);
+	}
+
+	return 0;
+}
+
+static int align_ldst_pair_simdfp(u32 insn, struct pt_regs *regs)
+{
+	const u32 OPC = GENMASK(31, 30);
+	const u32 L_MASK =
BIT(22); + + int opc = FIELD_GET(OPC, insn); + int L = FIELD_GET(L_MASK, insn); + + bool wback = !!(insn & BIT(23)); + bool postindex = !(insn & BIT(24)); + + int n = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn); + int t = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); + int t2 = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT2, insn); + bool is_store = !L; + int scale = 2 + opc; + int datasize = 8 << scale; + u64 uoffset = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_7, insn); + s64 offset = sign_extend64(uoffset, 6) << scale; + u64 address; + u64 data1_d0, data1_d1, data2_d0, data2_d1; + u64 dbytes; + + if (opc == 0x3) + return 1; + + if (!is_store && t == t2) + return 1; + + dbytes = datasize / 8; + + address = regs_get_register(regs, n << 3); + + if (!postindex) + address += offset; + + if (is_store) { + data1_d0 = get_vn_dt(t, 0); + data2_d0 = get_vn_dt(t2, 0); + if (datasize == 128) { + data1_d1 = get_vn_dt(t, 1); + data2_d1 = get_vn_dt(t2, 1); + if (align_store(address, 8, data1_d0) || + align_store(address + 8, 8, data1_d1) || + align_store(address + 16, 8, data2_d0) || + align_store(address + 24, 8, data2_d1)) + return 1; + } else { + if (align_store(address, dbytes, data1_d0) || + align_store(address + dbytes, dbytes, data2_d0)) + return 1; + } + } else { + if (datasize == 128) { + if (align_load(address, 8, &data1_d0) || + align_load(address + 8, 8, &data1_d1) || + align_load(address + 16, 8, &data2_d0) || + align_load(address + 24, 8, &data2_d1)) + return 1; + } else { + if (align_load(address, dbytes, &data1_d0) || + align_load(address + dbytes, dbytes, &data2_d0)) + return 1; + data1_d1 = data2_d1 = 0; + } + set_vn_dt(t, 0, data1_d0); + set_vn_dt(t, 1, data1_d1); + set_vn_dt(t2, 0, data2_d0); + set_vn_dt(t2, 1, data2_d1); + } + + if (wback) { + if (postindex) + address += offset; + if (n == 31) + regs->sp = address; + else + pt_regs_write_reg(regs, n, address); + } + + return 0; +} + +static int align_ldst_regoff(u32 insn, struct pt_regs *regs) +{ + const u32 SIZE = GENMASK(31, 30); + const u32 OPC = GENMASK(23, 22); + const u32 OPTION = GENMASK(15, 13); + const u32 S = BIT(12); + + u32 size = FIELD_GET(SIZE, insn); + u32 opc = FIELD_GET(OPC, insn); + u32 option = FIELD_GET(OPTION, insn); + u32 s = FIELD_GET(S, insn); + int scale = size; + int extend_len = (option & 0x1) ? 64 : 32; + bool extend_unsigned = !(option & 0x4); + int shift = s ? scale : 0; + + int n = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn); + int t = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); + int m = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RM, insn); + bool is_store; + bool is_signed; + int regsize; + int datasize; + u64 offset; + u64 address; + u64 data; + + if ((opc & 0x2) == 0) { + /* store or zero-extending load */ + is_store = !(opc & 0x1); + regsize = size == 0x3 ? 64 : 32; + is_signed = false; + } else { + if (size == 0x3) { + if ((opc & 0x1) == 0) { + /* prefetch */ + return 0; + } + /* undefined */ + return 1; + } + /* sign-extending load */ + is_store = false; + if (size == 0x2 && (opc & 0x1) == 0x1) { + /* undefined */ + return 1; + } + regsize = (opc & 0x1) == 0x1 ? 
32 : 64;
+		is_signed = true;
+	}
+
+	datasize = 8 << scale;
+
+	if (n == t && n != 31)
+		return 1;
+
+	offset = pt_regs_read_reg(regs, m);
+	if (extend_len == 32) {
+		offset &= (u32)~0;
+		if (!extend_unsigned)
+			offset = sign_extend64(offset, 31);
+	}
+	offset <<= shift;
+
+	address = regs_get_register(regs, n << 3) + offset;
+
+	if (is_store) {
+		data = pt_regs_read_reg(regs, t);
+		if (align_store(address, datasize / 8, data))
+			return 1;
+	} else {
+		if (align_load(address, datasize / 8, &data))
+			return 1;
+		if (is_signed) {
+			if (regsize == 32)
+				data = sign_extend32(data, datasize - 1);
+			else
+				data = sign_extend64(data, datasize - 1);
+		}
+		pt_regs_write_reg(regs, t, data);
+	}
+
+	return 0;
+}
+
+static int align_ldst_regoff_simdfp(u32 insn, struct pt_regs *regs)
+{
+	const u32 SIZE = GENMASK(31, 30);
+	const u32 OPC = GENMASK(23, 22);
+	const u32 OPTION = GENMASK(15, 13);
+	const u32 S = BIT(12);
+
+	u32 size = FIELD_GET(SIZE, insn);
+	u32 opc = FIELD_GET(OPC, insn);
+	u32 option = FIELD_GET(OPTION, insn);
+	u32 s = FIELD_GET(S, insn);
+	int scale = (opc & 0x2) << 1 | size;
+	int extend_len = (option & 0x1) ? 64 : 32;
+	bool extend_unsigned = !(option & 0x4);
+	int shift = s ? scale : 0;
+
+	int n = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn);
+	int t = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn);
+	int m = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RM, insn);
+	bool is_store = !(opc & BIT(0));
+	int datasize;
+	u64 offset;
+	u64 address;
+	u64 data_d0, data_d1;
+
+	if ((option & 0x2) == 0)
+		return 1;
+
+	datasize = 8 << scale;
+
+	if (n == t && n != 31)
+		return 1;
+
+	offset = pt_regs_read_reg(regs, m);
+	if (extend_len == 32) {
+		offset &= (u32)~0;
+		if (!extend_unsigned)
+			offset = sign_extend64(offset, 31);
+	}
+	offset <<= shift;
+
+	address = regs_get_register(regs, n << 3) + offset;
+
+	if (is_store) {
+		data_d0 = get_vn_dt(t, 0);
+		if (datasize == 128) {
+			data_d1 = get_vn_dt(t, 1);
+			if (align_store(address, 8, data_d0) ||
+			    align_store(address + 8, 8, data_d1))
+				return 1;
+		} else {
+			if (align_store(address, datasize / 8, data_d0))
+				return 1;
+		}
+	} else {
+		if (datasize == 128) {
+			if (align_load(address, 8, &data_d0) ||
+			    align_load(address + 8, 8, &data_d1))
+				return 1;
+		} else {
+			if (align_load(address, datasize / 8, &data_d0))
+				return 1;
+			data_d1 = 0;
+		}
+		set_vn_dt(t, 0, data_d0);
+		set_vn_dt(t, 1, data_d1);
+	}
+
+	return 0;
+}
+
+static int align_ldst_imm(u32 insn, struct pt_regs *regs)
+{
+	const u32 SIZE = GENMASK(31, 30);
+	const u32 OPC = GENMASK(23, 22);
+
+	u32 size = FIELD_GET(SIZE, insn);
+	u32 opc = FIELD_GET(OPC, insn);
+	bool wback = !(insn & BIT(24)) && !!(insn & BIT(10));
+	bool postindex = wback && !(insn & BIT(11));
+	int scale = size;
+	u64 offset;
+
+	int n = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn);
+	int t = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn);
+	bool is_store;
+	bool is_signed;
+	int regsize;
+	int datasize;
+	u64 address;
+	u64 data;
+
+	if (!(insn & BIT(24))) {
+		u64 uoffset =
+			aarch64_insn_decode_immediate(AARCH64_INSN_IMM_9, insn);
+		offset = sign_extend64(uoffset, 8);
+	} else {
+		offset = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, insn);
+		offset <<= scale;
+	}
+
+	if ((opc & 0x2) == 0) {
+		/* store or zero-extending load */
+		is_store = !(opc & 0x1);
+		regsize = size == 0x3 ?
64 : 32; + is_signed = false; + } else { + if (size == 0x3) { + if (FIELD_GET(GENMASK(11, 10), insn) == 0 && (opc & 0x1) == 0) { + /* prefetch */ + return 0; + } + /* undefined */ + return 1; + } + /* sign-extending load */ + is_store = false; + if (size == 0x2 && (opc & 0x1) == 0x1) { + /* undefined */ + return 1; + } + regsize = (opc & 0x1) == 0x1 ? 32 : 64; + is_signed = true; + } + + datasize = 8 << scale; + + if (n == t && n != 31) + return 1; + + address = regs_get_register(regs, n << 3); + + if (!postindex) + address += offset; + + if (is_store) { + data = pt_regs_read_reg(regs, t); + if (align_store(address, datasize / 8, data)) + return 1; + } else { + if (align_load(address, datasize / 8, &data)) + return 1; + if (is_signed) { + if (regsize == 32) + data = sign_extend32(data, datasize - 1); + else + data = sign_extend64(data, datasize - 1); + } + pt_regs_write_reg(regs, t, data); + } + + if (wback) { + if (postindex) + address += offset; + if (n == 31) + regs->sp = address; + else + pt_regs_write_reg(regs, n, address); + } + + return 0; +} + +static int align_ldst_imm_simdfp(u32 insn, struct pt_regs *regs) +{ + const u32 SIZE = GENMASK(31, 30); + const u32 OPC = GENMASK(23, 22); + + u32 size = FIELD_GET(SIZE, insn); + u32 opc = FIELD_GET(OPC, insn); + bool wback = !(insn & BIT(24)) && !!(insn & BIT(10)); + bool postindex = wback && !(insn & BIT(11)); + int scale = (opc & 0x2) << 1 | size; + u64 offset; + + int n = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn); + int t = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); + bool is_store = !(opc & BIT(0)); + int datasize; + u64 address; + u64 data_d0, data_d1; + + if (scale > 4) + return 1; + + if (!(insn & BIT(24))) { + u64 uoffset = + aarch64_insn_decode_immediate(AARCH64_INSN_IMM_9, insn); + offset = sign_extend64(uoffset, 8); + } else { + offset = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_12, insn); + offset <<= scale; + } + + datasize = 8 << scale; + + address = regs_get_register(regs, n << 3); + + if (!postindex) + address += offset; + + if (is_store) { + data_d0 = get_vn_dt(t, 0); + if (datasize == 128) { + data_d1 = get_vn_dt(t, 1); + if (align_store(address, 8, data_d0) || + align_store(address + 8, 8, data_d1)) + return 1; + } else { + if (align_store(address, datasize / 8, data_d0)) + return 1; + } + } else { + if (datasize == 128) { + if (align_load(address, 8, &data_d0) || + align_load(address + 8, 8, &data_d1)) + return 1; + } else { + if (align_load(address, datasize / 8, &data_d0)) + return 1; + data_d1 = 0; + } + set_vn_dt(t, 0, data_d0); + set_vn_dt(t, 1, data_d1); + } + + if (wback) { + if (postindex) + address += offset; + if (n == 31) + regs->sp = address; + else + pt_regs_write_reg(regs, n, address); + } + + return 0; +} + +static int align_ldst_vector_multiple(u32 insn, struct pt_regs *regs) +{ + const u32 Q_MASK = BIT(30); + const u32 L_MASK = BIT(22); + const u32 OPCODE = GENMASK(15, 12); + const u32 SIZE = GENMASK(11, 10); + + u32 Q = FIELD_GET(Q_MASK, insn); + u32 L = FIELD_GET(L_MASK, insn); + u32 opcode = FIELD_GET(OPCODE, insn); + u32 size = FIELD_GET(SIZE, insn); + + int t = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); + int n = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn); + int m = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RM, insn); + bool wback = !!(insn & BIT(23)); + + int datasize = Q ? 
128 : 64; + int esize = 8 << size; + int elements = datasize / esize; + int rpt; + int selem; + u64 address; + u64 offs; + u64 rval_d0, rval_d1; + int tt; + int ebytes; + int r; + int e; + int s; + u64 data; + + switch (opcode) { + case 0: // LD/ST4 (4 registers) + rpt = 1; + selem = 4; + break; + case 2: // LD/ST1 (4 registers) + rpt = 4; + selem = 1; + break; + case 4: // LD/ST3 (3 registers) + rpt = 1; + selem = 3; + break; + case 6: // LD/ST1 (3 registers) + rpt = 3; + selem = 1; + break; + case 7: // LD/ST1 (1 register) + rpt = 1; + selem = 1; + break; + case 8: // LD/ST2 (2 registers) + rpt = 1; + selem = 2; + break; + case 10: // LD/ST1 (2 registers) + rpt = 2; + selem = 1; + break; + default: + return 1; + } + + if (size == 3 && Q == 0 && selem != 1) + return 1; + + ebytes = esize / 8; + + address = regs_get_register(regs, n << 3); + + offs = 0; + + for (r = 0; r < rpt; r++) { + for (e = 0; e < elements; e++) { + tt = (t + r) % 32; + for (s = 0; s < selem; s++) { + rval_d0 = get_vn_dt(tt, 0); + rval_d1 = get_vn_dt(tt, 1); + if (L) { + if (align_load(address + offs, ebytes, &data)) + return 1; + elem_set(&rval_d1, &rval_d0, e, esize, data); + set_vn_dt(tt, 0, rval_d0); + set_vn_dt(tt, 1, rval_d1); + } else { + data = elem_get(rval_d1, rval_d0, e, esize); + if (align_store(address + offs, ebytes, data)) + return 1; + } + offs += ebytes; + tt = (tt + 1) % 32; + } + } + } + + if (wback) { + if (m != 31) + offs = regs_get_register(regs, m << 3); + if (n == 31) + regs->sp = address + offs; + else + pt_regs_write_reg(regs, n, address + offs); + } + + return 0; +} + +static int align_ldst_vector_single(u32 insn, struct pt_regs *regs) +{ + const u32 Q_MASK = BIT(30); + const u32 L_MASK = BIT(22); + const u32 R_MASK = BIT(21); + const u32 OPCODE = GENMASK(15, 13); + const u32 S_MASK = BIT(12); + const u32 SIZE = GENMASK(11, 10); + + u32 Q = FIELD_GET(Q_MASK, insn); + u32 L = FIELD_GET(L_MASK, insn); + u32 R = FIELD_GET(R_MASK, insn); + u32 opcode = FIELD_GET(OPCODE, insn); + u32 S = FIELD_GET(S_MASK, insn); + u32 size = FIELD_GET(SIZE, insn); + + int t = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn); + int n = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RN, insn); + int m = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RM, insn); + bool wback = !!(insn & BIT(23)); + + int init_scale = opcode >> 1; + int scale = init_scale; + int selem = (((opcode & 1) << 1) | R) + 1; + bool replicate = false; + int index; + int datasize; + int esize; + u64 address; + u64 offs; + u64 rval_d0, rval_d1; + u64 element; + int ebytes; + int s; + u64 data; + + switch (scale) { + case 3: + if (!L || S) + return 1; + scale = size; + replicate = true; + break; + case 0: + index = (Q << 3) | (S << 2) | size; + break; + case 1: + if (size & 1) + return 1; + index = (Q << 2) | (S << 1) | (size >> 1); + break; + case 2: + if (size & 2) + return 1; + if (!(size & 1)) + index = (Q << 1) | S; + else { + if (S) + return 1; + index = Q; + scale = 3; + } + break; + } + + datasize = Q ? 
128 : 64; + esize = 8 << scale; + + ebytes = esize / 8; + + address = regs_get_register(regs, n << 3); + + offs = 0; + + if (replicate) { + for (s = 0; s < selem; s++) { + if (align_load(address + offs, ebytes, &element)) + return 1; + data = replicate64(element, esize); + set_vn_dt(t, 0, data); + if (datasize == 128) + set_vn_dt(t, 1, data); + else + set_vn_dt(t, 1, 0); + offs += ebytes; + t = (t + 1) & 31; + } + } else { + for (s = 0; s < selem; s++) { + rval_d0 = get_vn_dt(t, 0); + rval_d1 = get_vn_dt(t, 1); + if (L) { + if (align_load(address + offs, ebytes, &data)) + return 1; + elem_set(&rval_d1, &rval_d0, index, esize, data); + set_vn_dt(t, 0, rval_d0); + set_vn_dt(t, 1, rval_d1); + } else { + data = elem_get(rval_d1, rval_d0, index, esize); + if (align_store(address + offs, ebytes, data)) + return 1; + } + offs += ebytes; + t = (t + 1) & 31; + } + } + + if (wback) { + if (m != 31) + offs = regs_get_register(regs, m << 3); + if (n == 31) + regs->sp = address + offs; + else + pt_regs_write_reg(regs, n, address + offs); + } + + return 0; +} + +static int align_ldst(u32 insn, struct pt_regs *regs) +{ + const u32 op0 = FIELD_GET(GENMASK(31, 28), insn); + const u32 op1 = FIELD_GET(BIT(26), insn); + const u32 op2 = FIELD_GET(GENMASK(24, 23), insn); + const u32 op3 = FIELD_GET(GENMASK(21, 16), insn); + const u32 op4 = FIELD_GET(GENMASK(11, 10), insn); + + if ((op0 & 0x3) == 0x2) { + /* + * |------+-----+-----+-----+-----+-----------------------------------------| + * | op0 | op1 | op2 | op3 | op4 | Decode group | + * |------+-----+-----+-----+-----+-----------------------------------------| + * | xx10 | - | 00 | - | - | Load/store no-allocate pair (offset) | + * | xx10 | - | 01 | - | - | Load/store register pair (post-indexed) | + * | xx10 | - | 10 | - | - | Load/store register pair (offset) | + * | xx10 | - | 11 | - | - | Load/store register pair (pre-indexed) | + * |------+-----+-----+-----+-----+-----------------------------------------| + */ + + if (op1 == 0) { /* V == 0 */ + /* general */ + return align_ldst_pair(insn, regs); + } + /* simdfp */ + return kernel_neon_wrapper(align_ldst_pair_simdfp, insn, regs); + } else if ((op0 & 0x3) == 0x3 && + (((op2 & 0x2) == 0 && (op3 & 0x20) == 0 && op4 != 0x2) || + ((op2 & 0x2) == 0x2))) { + /* + * |------+-----+-----+--------+-----+---------------------------------------------| + * | op0 | op1 | op2 | op3 | op4 | Decode group | + * |------+-----+-----+--------+-----+---------------------------------------------| + * | xx11 | - | 0x | 0xxxxx | 00 | Load/store register (unscaled immediate) | + * | xx11 | - | 0x | 0xxxxx | 01 | Load/store register (immediate post-indexed | + * | xx11 | - | 0x | 0xxxxx | 11 | Load/store register (immediate pre-indexed) | + * | xx11 | - | 1x | - | - | Load/store register (unsigned immediate) | + * |------+-----+-----+--------+-----+---------------------------------------------| + */ + + if (op1 == 0) { /* V == 0 */ + /* general */ + return align_ldst_imm(insn, regs); + } + /* simdfp */ + return kernel_neon_wrapper(align_ldst_imm_simdfp, insn, regs); + } else if ((op0 & 0x3) == 0x3 && (op2 & 0x2) == 0 && + (op3 & 0x20) == 0x20 && op4 == 0x2) { + /* + * |------+-----+-----+--------+-----+---------------------------------------| + * | op0 | op1 | op2 | op3 | op4 | | + * |------+-----+-----+--------+-----+---------------------------------------| + * | xx11 | - | 0x | 1xxxxx | 10 | Load/store register (register offset) | + * |------+-----+-----+--------+-----+---------------------------------------| + */ + if (op1 == 0) 
{ /* V == 0 */ + /* general */ + return align_ldst_regoff(insn, regs); + } + /* simdfp */ + return kernel_neon_wrapper(align_ldst_regoff_simdfp, insn, regs); + } else if ((op0 & 0xb) == 0 && op1 == 1 && + ((op2 == 0 && op3 == 0) || (op2 == 1 && ((op3 & 0x20) == 0)))) { + /* + * |------+-----+-----+--------+-----+---------------------------------------------| + * | op0 | op1 | op2 | op3 | op4 | | + * |------+-----+-----+--------+-----+---------------------------------------------| + * | 0x00 | 1 | 00 | 000000 | - | Advanced SIMD load/store multiple structure | + * | 0x00 | 1 | 01 | 0xxxxx | - | Advanced SIMD load/store multiple structure | + * | | | | | | (post-indexed) | + * |------+-----+-----+--------+-----+---------------------------------------------| + */ + return kernel_neon_wrapper(align_ldst_vector_multiple, insn, regs); + } else if ((op0 & 0xb) == 0 && op1 == 1 && + ((op2 == 2 && ((op3 & 0x1f) == 0)) || op2 == 3)) { + /* + * |------+-----+-----+--------+-----+-------------------------------------------| + * | op0 | op1 | op2 | op3 | op4 | | + * |------+-----+-----+--------+-----+-------------------------------------------| + * | 0x00 | 1 | 10 | x00000 | - | Advanced SIMD load/store single structure | + * | 0x00 | 1 | 11 | - | - | Advanced SIMD load/store single structure | + * | | | | | | (post-indexed) | + * |------+-----+-----+--------+-----+-------------------------------------------| + */ + return kernel_neon_wrapper(align_ldst_vector_single, insn, regs); + } else + return 1; +} + +int do_alignment_fixup(unsigned long addr, unsigned int esr, + struct pt_regs *regs) +{ + u32 insn; + int res; + + if (user_mode(regs)) { + __le32 insn_le; + + if (!is_ttbr0_addr(addr)) + return 1; + + if (get_user(insn_le, + (__le32 __user *)instruction_pointer(regs))) + return 1; + insn = le32_to_cpu(insn_le); + } else { + if (aarch64_insn_read((void *)instruction_pointer(regs), &insn)) + return 1; + } + + if (__aarch64_insn_is_class_ldst(insn)) + res = align_ldst(insn, regs); + else if (__aarch64_insn_is_dc_zva(insn)) + res = align_dc_zva(addr, regs); + else + res = 1; + + if (!res) { + perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, regs->pc); + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); + } + return res; +} diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a193b6a5d1e65f..4a4e3bc9188c95 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -795,8 +795,10 @@ static int __kprobes do_translation_fault(unsigned long far, static int do_alignment_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { - if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) && - compat_user_mode(regs)) + if (!compat_user_mode(regs)) { + if (IS_ENABLED(CONFIG_ARM64_ALIGNMENT_FIXUPS)) + return do_alignment_fixup(far, esr, regs); + } else if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS)) return do_compat_alignment_fixup(far, regs); do_bad_area(far, esr, regs); return 0; From 8660845506ae701aa9e8beb44411bcc8debcf71c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20B=C4=83l=C4=83nic=C4=83?= Date: Wed, 8 Oct 2025 10:49:31 -0400 Subject: [PATCH 07/11] arm64: mm: Force Device mappings for PCIe MMIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe device drivers may map MMIO space as Normal non-cacheable, for the purpose of enabling write combining or unaligned accesses. On many platforms (e.g. Ampere Altra, RK35xx), the PCIe interface cannot support unaligned outbound transactions. 
This may lead to data corruption, for instance, when a regular memcpy
is performed by an application on a GPU's VRAM BAR.

Add an option to force all software that maps PCIe MMIO space as Normal
non-cacheable memory to use Device-nGnRE instead. When strict alignment
requirements are not met, the CPU will raise alignment faults, which the
kernel can fix up if CONFIG_ARM64_ALIGNMENT_FIXUPS is enabled.

Signed-off-by: Mario Bălănică
---
 arch/arm64/Kconfig               | 17 ++++++++++++
 arch/arm64/include/asm/pgtable.h | 20 ++++++++++++-----
 kernel/resource.c                | 33 ++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index d07efcdb78a854..6c3d97821b0376 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1696,6 +1696,23 @@ config ARM64_TAGGED_ADDR_ABI
 	  to system calls as pointer arguments. For details, see
 	  Documentation/arch/arm64/tagged-address-abi.rst.
 
+config ARM64_FORCE_PCIE_MMIO_DEVICE_MAPPINGS
+	bool "Force Device memory mappings for PCIe MMIO space"
+	default y
+	help
+	  PCIe device drivers may map MMIO space as Normal non-cacheable,
+	  for the purpose of enabling write combining or unaligned accesses.
+
+	  On many platforms (e.g. Ampere Altra, RK35xx), the PCIe interface
+	  cannot support unaligned outbound transactions. This may lead to
+	  data corruption, for instance, when a regular memcpy is performed by
+	  an application on a GPU's VRAM BAR.
+
+	  This option forces all software that maps PCIe MMIO space as Normal
+	  non-cacheable memory to use Device-nGnRE instead. When strict
+	  alignment requirements are not met, the CPU will raise alignment
+	  faults, which the kernel can fix up if CONFIG_ARM64_ALIGNMENT_FIXUPS
+	  is enabled.
+
 config ARM64_ALIGNMENT_FIXUPS
 	bool "Fix up misaligned multi-word loads and stores in 64-bit kernel/user space"
 	default y
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 9016ae8de5c9e2..7771b466c4f6d9 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -338,11 +338,6 @@ static inline pte_t pte_mkyoung(pte_t pte)
 	return set_pte_bit(pte, __pgprot(PTE_AF));
 }
 
-static inline pte_t pte_mkspecial(pte_t pte)
-{
-	return set_pte_bit(pte, __pgprot(PTE_SPECIAL));
-}
-
 static inline pte_t pte_mkcont(pte_t pte)
 {
 	return set_pte_bit(pte, __pgprot(PTE_CONT));
@@ -802,6 +797,21 @@ static inline void __set_puds(struct mm_struct *mm,
 	__pgprot_modify(prot, PTE_ATTRINDX_MASK, \
 			PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)
 
+extern bool range_is_pci(phys_addr_t phys_addr, size_t size);
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+#ifdef CONFIG_ARM64_FORCE_PCIE_MMIO_DEVICE_MAPPINGS
+	phys_addr_t phys = __pte_to_phys(pte);
+	pgprot_t prot = __pgprot(pte_val(pte) & ~__phys_to_pte_val(__pte_to_phys(__pte(~0ull))));
+
+	if ((pgprot_val(prot) != pgprot_val(pgprot_device(prot))) &&
+	    range_is_pci(phys, PAGE_SIZE))
+		pte = __pte(__phys_to_pte_val(phys) | pgprot_val(pgprot_device(prot)));
+#endif
+	return set_pte_bit(pte, __pgprot(PTE_SPECIAL));
+}
+
 #define __HAVE_PHYS_MEM_ACCESS_PROT
 struct file;
 extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
diff --git a/kernel/resource.c b/kernel/resource.c
index edbe8ef7e8efd6..4ac91b689d75bf 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -389,6 +389,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
 			.flags = p->flags,
 			.desc = p->desc,
 			.parent = p->parent,
+			.name = p->name,
 		};
 	}
 
@@ -566,6 +567,38 @@ int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
+static int pci_res_check(struct resource *res, void *arg)
+{
+	if (!res->name)
+		return 1;
+
+	return strncmp(res->name, "PCI", 3);
+}
+
+bool range_is_pci(phys_addr_t phys_addr, size_t size)
+{
+	u64 start, end;
+	int ret;
+
+	start = phys_addr;
+	end = phys_addr + size;
+
+	/* Check 32-bit MMIO */
+	ret = walk_iomem_res_desc(IORES_DESC_NONE, IORESOURCE_MEM,
+				  start, end, NULL, pci_res_check);
+	if (!ret)
+		return true;
+
+	/* Check 64-bit MMIO */
+	ret = walk_iomem_res_desc(IORES_DESC_NONE, IORESOURCE_MEM_64,
+				  start, end, NULL, pci_res_check);
+	if (!ret)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(range_is_pci);
+
 static int __region_intersects(struct resource *parent, resource_size_t start,
 			       size_t size, unsigned long flags,
 			       unsigned long desc)

From c5322426b1d936d6679844b1416d4a7792df59c5 Mon Sep 17 00:00:00 2001
From: Dave Stevenson
Date: Thu, 8 Jan 2026 14:58:40 +0000
Subject: [PATCH 08/11] fixup "arm64: mm: Force Device mappings for PCIe MMIO"
 for other architectures

Avoid the warning "no previous prototype for 'range_is_pci'" when
building for ARM or other architectures.

Signed-off-by: Dave Stevenson
---
 kernel/resource.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/resource.c b/kernel/resource.c
index 4ac91b689d75bf..c6d53ddbcbf468 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -567,6 +567,7 @@ int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
+#ifdef CONFIG_ARM64
 static int pci_res_check(struct resource *res, void *arg)
 {
 	if (!res->name)
@@ -598,6 +599,7 @@ bool range_is_pci(phys_addr_t phys_addr, size_t size)
 	return false;
 }
 EXPORT_SYMBOL_GPL(range_is_pci);
+#endif
 
 static int __region_intersects(struct resource *parent, resource_size_t start,
 			       size_t size, unsigned long flags,

From 2516b9fc3c77d430652f0f945845475ab9380b51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20B=C4=83l=C4=83nic=C4=83?=
Date: Wed, 8 Oct 2025 11:37:50 -0400
Subject: [PATCH 09/11] drm: Force write-combined mappings for DMA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PCIe GPU device drivers may use normal cached mappings for DMA memory.
This requires the PCIe interface to be coherent with the CPU caches,
which is not supported by many Arm platforms (e.g. RK35xx), leading to
data corruption on inbound transactions.

Add an option to force write-combined mappings instead (Normal
non-cacheable on Arm).

Note that this is just a band-aid to keep the patch small. The TTM
allocator should frankly not be concerned with hardware limitations and
always pass the requested caching type (a driver could still use cached
memory and perform its own cache maintenance). A proper solution would
be for GPU drivers to check whether the device supports coherency and
request the appropriate caching type. The drm_arch_can_wc_memory()
helper also needs to be reworked or possibly even dropped.

Signed-off-by: Mario Bălănică
---
 drivers/gpu/drm/Kconfig      | 16 ++++++++++++++++
 drivers/gpu/drm/ttm/ttm_tt.c |  4 ++++
 include/drm/drm_cache.h      |  3 +++
 3 files changed, 23 insertions(+)

diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 5b1dee660a07d2..7b80f71364ad23 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -183,6 +183,22 @@ config DRM_LOAD_EDID_FIRMWARE
 	  default case is N. Details and instructions how to build your own
 	  EDID data are given in Documentation/admin-guide/edid.rst.
 
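
The "proper solution" described in the message above might look roughly like
this sketch (hypothetical helper, not part of this series; dev_is_dma_coherent()
is the existing helper from linux/dma-map-ops.h, and the ttm_caching values come
from drm/ttm/ttm_caching.h):

#include <linux/dma-map-ops.h>
#include <drm/ttm/ttm_caching.h>

/* Hypothetical: derive the caching type from the device's actual
 * coherency capability instead of forcing it at build time. */
static enum ttm_caching gpu_pick_system_caching(struct device *dev)
{
	if (dev_is_dma_coherent(dev))
		return ttm_cached;	/* the bus snoops CPU caches */

	return ttm_write_combined;	/* Normal-NC on arm64; no snooping required */
}
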
+config DRM_FORCE_DMA_WRITE_COMBINED_MAPPINGS + bool "Force write-combined mappings for DMA" + default y + help + PCIe GPU device drivers may use normal cached mappings for DMA memory. + + This requires the PCIe interface to be coherent with the CPU caches, + which is not supported by many Arm platforms (e.g. RK35xx), leading to + data corruption on inbound transactions. + + Enable this option to force write-combined mappings instead (Normal + non-cacheable on Arm). + + Disable if the hardware supports coherency, as it might cause issues on + certain platforms that ignore the PCIe NoSnoop TLP attribute. + source "drivers/gpu/drm/display/Kconfig" config DRM_TTM diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index 506e257dfba850..f1d31e5cd9143f 100644 --- a/drivers/gpu/drm/ttm/ttm_tt.c +++ b/drivers/gpu/drm/ttm/ttm_tt.c @@ -154,6 +154,10 @@ static void ttm_tt_init_fields(struct ttm_tt *ttm, enum ttm_caching caching, unsigned long extra_pages) { +#ifdef CONFIG_DRM_FORCE_DMA_WRITE_COMBINED_MAPPINGS + if (caching == ttm_cached) + caching = ttm_write_combined; +#endif ttm->num_pages = (PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT) + extra_pages; ttm->page_flags = page_flags; ttm->dma_address = NULL; diff --git a/include/drm/drm_cache.h b/include/drm/drm_cache.h index 08e0e3ffad1319..ce86c55365a314 100644 --- a/include/drm/drm_cache.h +++ b/include/drm/drm_cache.h @@ -45,6 +45,9 @@ bool drm_need_swiotlb(int dma_bits); static inline bool drm_arch_can_wc_memory(void) { +#ifdef CONFIG_DRM_FORCE_DMA_WRITE_COMBINED_MAPPINGS + return true; +#endif #if defined(CONFIG_PPC) && !defined(CONFIG_NOT_COHERENT_CACHE) return false; #elif defined(CONFIG_MIPS) && defined(CONFIG_CPU_LOONGSON64) From 7705cb490aa272c825739e4c150d6df0e792e440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20B=C4=83l=C4=83nic=C4=83?= Date: Mon, 27 Oct 2025 21:57:50 -0400 Subject: [PATCH 10/11] drm/nouveau: Support non-coherent hardware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mario Bălănică --- drivers/gpu/drm/nouveau/nouveau_drv.h | 3 +++ drivers/gpu/drm/nouveau/nouveau_sgdma.c | 4 +-- .../gpu/drm/nouveau/nvkm/engine/device/pci.c | 3 ++- drivers/gpu/drm/nouveau/nvkm/subdev/fb/base.c | 14 ++++------ .../drm/nouveau/nvkm/subdev/gsp/rm/r535/gsp.c | 26 ++++++++++--------- drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.c | 8 +++++- 6 files changed, 32 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h index 55abc510067bc8..bb07eedc9a8a80 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drv.h +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h @@ -317,6 +317,9 @@ nouveau_drm(struct drm_device *dev) static inline bool nouveau_drm_use_coherent_gpu_mapping(struct nouveau_drm *drm) { +#ifdef CONFIG_DRM_FORCE_DMA_WRITE_COMBINED_MAPPINGS + return false; +#endif struct nvif_mmu *mmu = &drm->client.mmu; return !(mmu->type[drm->ttm.type_host[0]].type & NVIF_MEM_UNCACHED); } diff --git a/drivers/gpu/drm/nouveau/nouveau_sgdma.c b/drivers/gpu/drm/nouveau/nouveau_sgdma.c index bd870028514b66..f3a1902bc7e0d9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sgdma.c +++ b/drivers/gpu/drm/nouveau/nouveau_sgdma.c @@ -72,9 +72,7 @@ nouveau_sgdma_create_ttm(struct ttm_buffer_object *bo, uint32_t page_flags) struct nouveau_sgdma_be *nvbe; enum ttm_caching caching; - if (nvbo->force_coherent) - caching = ttm_uncached; - else if (drm->agp.bridge) + if (nvbo->force_coherent || drm->agp.bridge) caching = ttm_write_combined; else 
caching = ttm_cached; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c index 4c29b60460d48d..9c7d14536ab672 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/pci.c @@ -1645,7 +1645,8 @@ nvkm_device_pci_func = { .irq = nvkm_device_pci_irq, .resource_addr = nvkm_device_pci_resource_addr, .resource_size = nvkm_device_pci_resource_size, - .cpu_coherent = !IS_ENABLED(CONFIG_ARM), + .cpu_coherent = !IS_ENABLED(CONFIG_ARM) && + !IS_ENABLED(CONFIG_DRM_FORCE_DMA_WRITE_COMBINED_MAPPINGS), }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/base.c index 7ce1b65e2c1c29..23681ed7e23899 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/base.c @@ -248,9 +248,8 @@ nvkm_fb_dtor(struct nvkm_subdev *subdev) nvkm_falcon_fw_dtor(&fb->vpr_scrubber); if (fb->sysmem.flush_page) { - dma_unmap_page(subdev->device->dev, fb->sysmem.flush_page_addr, - PAGE_SIZE, DMA_BIDIRECTIONAL); - __free_page(fb->sysmem.flush_page); + dma_free_coherent(subdev->device->dev, PAGE_SIZE, + fb->sysmem.flush_page, fb->sysmem.flush_page_addr); } if (fb->func->dtor) @@ -279,14 +278,11 @@ nvkm_fb_ctor(const struct nvkm_fb_func *func, struct nvkm_device *device, mutex_init(&fb->tags.mutex); if (func->sysmem.flush_page_init) { - fb->sysmem.flush_page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO); + fb->sysmem.flush_page = dma_alloc_coherent(device->dev, PAGE_SIZE, + &fb->sysmem.flush_page_addr, + GFP_KERNEL | __GFP_ZERO); if (!fb->sysmem.flush_page) return -ENOMEM; - - fb->sysmem.flush_page_addr = dma_map_page(device->dev, fb->sysmem.flush_page, - 0, PAGE_SIZE, DMA_BIDIRECTIONAL); - if (dma_mapping_error(device->dev, fb->sysmem.flush_page_addr)) - return -EFAULT; } return 0; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/gsp.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/gsp.c index a575a8dbf727df..1b6a1009d33f3c 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/gsp.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/gsp.c @@ -1571,12 +1571,12 @@ nvkm_gsp_sg_free(struct nvkm_device *device, struct sg_table *sgt) struct scatterlist *sgl; int i; - dma_unmap_sgtable(device->dev, sgt, DMA_BIDIRECTIONAL, 0); - for_each_sgtable_sg(sgt, sgl, i) { - struct page *page = sg_page(sgl); + void *cpu_addr = sg_virt(sgl); + dma_addr_t dma_addr = sg_dma_address(sgl); - __free_page(page); + if (cpu_addr && dma_addr) + dma_free_coherent(device->dev, PAGE_SIZE, cpu_addr, dma_addr); } sg_free_table(sgt); @@ -1594,21 +1594,23 @@ nvkm_gsp_sg(struct nvkm_device *device, u64 size, struct sg_table *sgt) return ret; for_each_sgtable_sg(sgt, sgl, i) { - struct page *page = alloc_page(GFP_KERNEL); + void *cpu_addr; + dma_addr_t dma_addr; - if (!page) { + cpu_addr = dma_alloc_coherent(device->dev, PAGE_SIZE, + &dma_addr, GFP_KERNEL); + if (!cpu_addr) { nvkm_gsp_sg_free(device, sgt); return -ENOMEM; } - sg_set_page(sgl, page, PAGE_SIZE, 0); + /* XXX: unsafe to use virt_to_page with dma_alloc_coherent */ + sg_set_page(sgl, virt_to_page(cpu_addr), PAGE_SIZE, 0); + sg_dma_address(sgl) = dma_addr; + sg_dma_len(sgl) = PAGE_SIZE; } - ret = dma_map_sgtable(device->dev, sgt, DMA_BIDIRECTIONAL, 0); - if (ret) - nvkm_gsp_sg_free(device, sgt); - - return ret; + return 0; } static void diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.c index 
92e363dbbc5a6e..21438193b967f7 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/mem.c @@ -133,8 +133,14 @@ int nvkm_mem_map_host(struct nvkm_memory *memory, void **pmap) { struct nvkm_mem *mem = nvkm_mem(memory); + pgprot_t prot = PAGE_KERNEL; + +#ifdef CONFIG_DRM_FORCE_DMA_WRITE_COMBINED_MAPPINGS + prot = pgprot_writecombine(prot); +#endif + if (mem->mem) { - *pmap = vmap(mem->mem, mem->pages, VM_MAP, PAGE_KERNEL); + *pmap = vmap(mem->mem, mem->pages, VM_MAP, prot); return *pmap ? 0 : -EFAULT; } return -EINVAL; From baf52e3ebe4fd82d10f9b6899c3c7280e2e8d4f2 Mon Sep 17 00:00:00 2001 From: Dave Stevenson Date: Thu, 8 Jan 2026 13:38:56 +0000 Subject: [PATCH 11/11] defconfig: Add Nouveau to the arm64/bcm2711_defconfig Signed-off-by: Dave Stevenson --- arch/arm64/configs/bcm2711_defconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/configs/bcm2711_defconfig b/arch/arm64/configs/bcm2711_defconfig index 5aa6993b269928..ca814b94ac6070 100644 --- a/arch/arm64/configs/bcm2711_defconfig +++ b/arch/arm64/configs/bcm2711_defconfig @@ -1074,6 +1074,9 @@ CONFIG_DRM_RADEON=m CONFIG_DRM_AMDGPU=m CONFIG_DRM_AMDGPU_SI=y CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_NOUVEAU=m +CONFIG_NOUVEAU_DEBUG_MMU=y +CONFIG_NOUVEAU_DEBUG_PUSH=y CONFIG_DRM_XE=m CONFIG_DRM_XE_FORCE_PROBE="*" CONFIG_DRM_UDL=m
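
A quick way to exercise the series end to end is an unaligned copy into a GPU
BAR mapped through sysfs. The PCI address and BAR index below are examples
only; with CONFIG_ARM64_ALIGNMENT_FIXUPS=y the copy should complete (slowly,
via the fixup path) instead of dying with SIGBUS:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* Example device: adjust the PCI address and resource index to the GPU under test */
	int fd = open("/sys/bus/pci/devices/0000:01:00.0/resource1", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	uint8_t *bar = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (bar == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	uint64_t v = 0x0123456789abcdefULL;
	memcpy(bar + 1, &v, sizeof(v));	/* unaligned: faults without the fixups */

	munmap(bar, 4096);
	close(fd);
	return 0;
}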