diff --git a/arch.mk b/arch.mk index beeffa4122..65fb84ad51 100644 --- a/arch.mk +++ b/arch.mk @@ -83,6 +83,42 @@ ifeq ($(ARCH),AARCH64) CFLAGS+=-DWOLFBOOT_ZYNQMP_CSU endif + ifeq ($(ZYNQMP_FSBL),1) + # wolfBoot fully replaces the Xilinx FSBL: the BootROM authenticates and + # loads wolfBoot into OCM at EL3, and wolfBoot runs psu_init() to bring up + # the PLLs/DDR/MIO/clocks before loading the downstream images. wolfBoot + # therefore links and runs entirely from OCM (DDR is not up at entry). + # + # The board-specific psu_init_gpl.c / psu_init_gpl.h (generated from the + # XSA, Xilinx copyright) are supplied at build time from + # ZYNQMP_PSU_INIT_DIR and are NOT part of the wolfBoot tree. The + # hal/zynqmp/ shim headers (xil_io.h, sleep.h) let that unmodified file + # compile. Set EL3_SECURE=1 in the target .config. + ZYNQMP_PSU_INIT_DIR?=hal/board/zynqmp + CFLAGS+=-DWOLFBOOT_ZYNQMP_FSBL + CFLAGS+=-Ihal/zynqmp -I$(ZYNQMP_PSU_INIT_DIR) + LSCRIPT_IN=hal/zynqmp_ocm.ld + OBJS+=hal/zynqmp_psu_shim.o + OBJS+=hal/zynqmp_atf.o + OBJS+=$(ZYNQMP_PSU_INIT_DIR)/psu_init_gpl.o + + # Load the PMU configuration object (EEMI permission table) into PMU + # firmware so the APU can control the SoC nodes. Like psu_init_gpl.c the + # pm_cfg_obj.c is design-specific and supplied from ZYNQMP_PSU_INIT_DIR. + ifeq ($(ZYNQMP_PM_CFG),1) + CFLAGS+=-DWOLFBOOT_ZYNQMP_PM_CFG + OBJS+=$(ZYNQMP_PSU_INIT_DIR)/pm_cfg_obj.o + endif + + # Run the PS-GTR serdes init (USB3/SATA/PCIe/DP PHY lanes). Required if the + # kernel drives a PS-GTR peripheral (e.g. USB3 dwc3, whose probe hangs on + # an unclocked PHY). Skipped by default since QSPI/SD/RGMII boot needs no + # serdes; the shim runs the full calibrated sequence when enabled. 
+ ifeq ($(ZYNQMP_PSU_INIT_SERDES),1) + CFLAGS+=-DZYNQMP_PSU_INIT_SERDES + endif + endif + endif ifeq ($(TARGET),versal) diff --git a/config/examples/zynqmp_fsbl.config b/config/examples/zynqmp_fsbl.config new file mode 100644 index 0000000000..3b2e80577d --- /dev/null +++ b/config/examples/zynqmp_fsbl.config @@ -0,0 +1,114 @@ +# wolfBoot configuration for AMD ZynqMP ZCU102 - FSBL REPLACEMENT (QSPI boot) +# Zynq UltraScale+ MPSoC ZU9EG - Quad-core ARM Cortex-A53 +# +# wolfBoot replaces the Xilinx First Stage Boot Loader (FSBL). The BootROM +# (optionally with eFuse PPK RSA secure boot) authenticates and loads wolfBoot +# directly into OCM at 0xFFFC0000 and enters it at EL3. wolfBoot then: +# 1. runs the board psu_init() (PLLs, DDR, MIO mux, clocks) +# 2. loads + verifies the downstream images with its OWN signing keys +# 3. (Milestone 1) hands off to real ARM Trusted Firmware (BL31) which keeps +# EL3/PSCI and drops to Linux. +# PMUFW is still loaded by the BootROM via the [pmufw_image] BIF tag. +# +# Boot flow: BootROM -> wolfBoot (OCM/EL3) -> psu_init -> verify+load +# BL31 + kernel + DTB -> BL31 -> Linux +# +# The board-specific psu_init_gpl.c / psu_init_gpl.h (generated from the XSA, +# Xilinx copyright) must be supplied at build time. By default they are read +# from hal/board/zynqmp/ (override with ZYNQMP_PSU_INIT_DIR=...). They are not +# part of the wolfBoot tree. Build with: +# make ZYNQMP_FSBL=1 ZYNQMP_PSU_INIT_DIR=/path/to/board +# Package with tools/scripts/zcu102/zynqmp_wolfboot_fsbl_qspi.bif + +ARCH?=AARCH64 +TARGET?=zynq + +# Enable the FSBL-replacement build: link/run from OCM, run psu_init at boot, +# pull in the board psu_init_gpl.o + the hal/zynqmp shim. +ZYNQMP_FSBL?=1 +ZYNQMP_PSU_INIT_DIR?=hal/board/zynqmp + +# Load the PMU configuration object (EEMI permission table) into PMU firmware +# so the APU can control SoC nodes; needs hal/board/zynqmp/pm_cfg_obj.c. 
+ZYNQMP_PM_CFG?=1 + +# Run the PS-GTR serdes init so USB3/SATA/PCIe/DP PHY lanes are clocked. +# Needed for the kernel's USB (dwc3) probe not to hang. Set 0 to skip (QSPI/SD +# boot does not need serdes). +ZYNQMP_PSU_INIT_SERDES?=1 + +# BootROM enters the [bootloader] partition at EL3. Run wolfBoot at EL3. +CFLAGS_EXTRA+=-DEL3_SECURE=1 + +# DDR-training delay correction. The BootROM leaves the IOU_SCNTRS system +# counter running at the ~1.6GHz pre-divider rate during psu_init (the /15 +# TIMESTAMP_REF_CTRL divisor does not re-latch the already-running counter), +# while usleep() computes its tick budget at the nominal 100MHz timestamp +# clock. Without correction every psu_ddr_phybringup settle delay is ~16x too +# short and DDR PHY training comes up marginal. Hardware-measured: a 16x scale +# restores correct training. See hal/zynqmp_psu_shim.c. +CFLAGS_EXTRA+=-DZYNQMP_USLEEP_SCALE=16 + +WOLFBOOT_VERSION?=0 + +# RSA 4096-bit with SHA3-384 (downstream image signing) +SIGN?=RSA4096 +HASH?=SHA3 +IMAGE_HEADER_SIZE?=1024 + +# Software ARMv8+Crypto assembly for SHA3 (no CSU/PMU SMC available at EL3 +# until BL31 is resident). +NO_ARM_ASM?=0 +HW_SHA3?=0 + +DEBUG?=0 +DEBUG_SYMBOLS=1 +DEBUG_UART=1 +CFLAGS_EXTRA+=-DDEBUG_ZYNQ=1 + +VTOR?=1 +CORTEX_M0?=0 +NO_ASM?=0 +ALLOW_DOWNGRADE?=0 +NVM_FLASH_WRITEONCE?=0 +V?=0 +SPMATH?=1 +RAM_CODE?=0 +DUALBANK_SWAP?=0 +PKA?=0 +WOLFTPM?=0 + +# Downstream images are read from QSPI (not XIP; wolfBoot executes from OCM). +EXT_FLASH?=1 +SPI_FLASH?=0 +NO_XIP=1 +USE_GCC=1 +ELF?=1 + +# Native gzip decompression for FIT subimages +GZIP?=1 + +# Flash Sector Size +WOLFBOOT_SECTOR_SIZE=0x20000 +# Application Partition Size +WOLFBOOT_PARTITION_SIZE=0x2A00000 +# wolfBoot self-location (OCM). Must match ORIGIN in hal/zynqmp_ocm.ld. 
+WOLFBOOT_ORIGIN=0xFFFC0000 +# Location in QSPI for Primary Boot Partition (OS payload: FIT kernel+DTB) +WOLFBOOT_PARTITION_BOOT_ADDRESS?=0x800000 +# Load Partition to RAM Address (DDR, after psu_init) +WOLFBOOT_LOAD_ADDRESS?=0x10000000 +# Location in QSPI for Secondary (update) Partition +WOLFBOOT_PARTITION_UPDATE_ADDRESS?=0x3A00000 +# Location to store wolfBoot state +WOLFBOOT_PARTITION_SWAP_ADDRESS?=0x63E0000 + +# DTS (Device Tree) +WOLFBOOT_LOAD_DTS_ADDRESS?=0x11800000 +WOLFBOOT_DTS_BOOT_ADDRESS?=0x7B0000 +WOLFBOOT_DTS_UPDATE_ADDRESS?=0x39B0000 + +CROSS_COMPILE=aarch64-none-elf- + +# Speed up reads from flash by using larger blocks +CFLAGS_EXTRA+=-DWOLFBOOT_SHA_BLOCK_SIZE=4096 diff --git a/config/examples/zynqmp_fsbl_sd.config b/config/examples/zynqmp_fsbl_sd.config new file mode 100644 index 0000000000..c3098a713b --- /dev/null +++ b/config/examples/zynqmp_fsbl_sd.config @@ -0,0 +1,83 @@ +# wolfBoot configuration for AMD ZynqMP ZCU102 - FSBL REPLACEMENT (SD boot) +# Zynq UltraScale+ MPSoC ZU9EG - Quad-core ARM Cortex-A53 +# +# Same FSBL-replacement role as config/examples/zynqmp_fsbl.config, but the +# BootROM loads wolfBoot from BOOT.BIN on the SD card FAT partition, and +# wolfBoot reads the downstream images from MBR partitions on the same card. +# +# Boot flow: BootROM -> wolfBoot (OCM/EL3) -> psu_init -> verify+load +# BL31 + kernel + DTB (from SD) -> BL31 -> Linux +# +# Supply the board psu_init_gpl.c/.h at build time (see zynqmp_fsbl.config). +# Build: make ZYNQMP_FSBL=1 ZYNQMP_PSU_INIT_DIR=/path/to/board +# Package: tools/scripts/zcu102/zynqmp_wolfboot_fsbl_sd.bif + +ARCH?=AARCH64 +TARGET?=zynq + +ZYNQMP_FSBL?=1 +ZYNQMP_PSU_INIT_DIR?=hal/board/zynqmp + +# BootROM enters the [bootloader] partition at EL3. 
+CFLAGS_EXTRA+=-DEL3_SECURE=1 + +WOLFBOOT_VERSION?=0 + +SIGN?=RSA4096 +HASH?=SHA3 +IMAGE_HEADER_SIZE?=1024 + +NO_ARM_ASM?=0 +HW_SHA3?=0 + +DEBUG?=0 +DEBUG_SYMBOLS=1 +DEBUG_UART=1 +CFLAGS_EXTRA+=-DDEBUG_ZYNQ=1 + +# SD card support - use SDHCI driver (SD1 external slot on ZCU102) +DISK_SDCARD?=1 +DISK_EMMC?=0 +CFLAGS_EXTRA+=-DSDHCI_FORCE_CARD_DETECT + +# No QSPI in the SD configuration +EXT_FLASH?=0 +NO_XIP=1 + +ELF?=1 +GZIP?=1 + +VTOR?=1 +CORTEX_M0?=0 +NO_ASM?=0 +ALLOW_DOWNGRADE?=0 +NVM_FLASH_WRITEONCE?=0 +V?=0 +SPMATH?=1 +RAM_CODE?=0 +DUALBANK_SWAP?=0 +PKA?=0 +WOLFTPM?=0 + +USE_GCC=1 +CROSS_COMPILE=aarch64-none-elf- + +# MBR partition layout (same as zynqmp_sdcard.config): +# part[0]=boot (FAT32, BOOT.BIN), part[1]=OFP_A, part[2]=OFP_B, part[3]=rootfs +WOLFBOOT_NO_PARTITIONS=1 +CFLAGS_EXTRA+=-DBOOT_PART_A=1 +CFLAGS_EXTRA+=-DBOOT_PART_B=2 +CFLAGS_EXTRA+=-DDISK_BLOCK_SIZE=0x80000 +CFLAGS_EXTRA+=-DLINUX_BOOTARGS_ROOT=\"/dev/mmcblk0p4\" + +# wolfBoot self-location (OCM). Must match ORIGIN in hal/zynqmp_ocm.ld. +WOLFBOOT_ORIGIN=0xFFFC0000 +# Load Partition to RAM Address (DDR, after psu_init) +WOLFBOOT_LOAD_ADDRESS?=0x10000000 +# DTS (Device Tree) load address +WOLFBOOT_LOAD_DTS_ADDRESS?=0x1000 + +# Required for test-app (even with WOLFBOOT_NO_PARTITIONS=1) +WOLFBOOT_PARTITION_BOOT_ADDRESS=0x80200000 +WOLFBOOT_PARTITION_SIZE=0x4000000 +WOLFBOOT_SECTOR_SIZE=0x1000 diff --git a/docs/Targets.md b/docs/Targets.md index 0f2cbb352d..e0e1137fdd 100644 --- a/docs/Targets.md +++ b/docs/Targets.md @@ -3709,6 +3709,58 @@ This target supports **two boot paths**: - **QSPI boot** (primary, production-style): `config/examples/zynqmp.config` - **SD card boot** (MBR, A/B images): `config/examples/zynqmp_sdcard.config` +In both of the above, wolfBoot is loaded *by* the Xilinx FSBL and runs at EL2. wolfBoot can also replace the FSBL entirely -- see the next section. 
+ +### wolfBoot as a full FSBL replacement (`ZYNQMP_FSBL=1`) + +Instead of being loaded by the Xilinx FSBL, wolfBoot can *be* the FSBL. The BootROM authenticates and loads wolfBoot directly (as the boot-image `[bootloader]` partition) into the 256 KB OCM at `0xFFFC0000` and enters it at EL3. wolfBoot then runs the board `psu_init()` (PLLs, DDR controller + PHY training, MIO mux, clocks) and loads + verifies the downstream images with its own keys: + +``` +BootROM -> wolfBoot (OCM, EL3) -> psu_init -> verify+load BL31 + kernel + DTB -> BL31 (EL3) -> Linux +``` + +PMUFW is still loaded by the BootROM via the `[pmufw_image]` BIF tag. Milestone 1 keeps real ARM Trusted Firmware (BL31) as the resident EL3 monitor. + +**Board PS init (psu_init) drop-in.** DDR controller init and PHY training are board/XSA specific and are generated by the Xilinx tools; they carry the Xilinx copyright and are not part of the wolfBoot tree. Copy your generated `psu_init_gpl.c` and `psu_init_gpl.h` into `hal/board/zynqmp/` (gitignored) or point `ZYNQMP_PSU_INIT_DIR` at them. For the ZCU102 these come from PetaLinux/Vitis (`project-spec/hw-description/psu_init_gpl.*`). See `hal/board/zynqmp/README.txt`. wolfBoot-owned shims at `hal/zynqmp/` (`xil_io.h`, `sleep.h`) let the unmodified file compile. + +**Build.** Link/run from OCM and run psu_init at boot: + +```sh +make ZYNQMP_FSBL=1 # QSPI: config/examples/zynqmp_fsbl.config +make ZYNQMP_FSBL=1 \ + ZYNQMP_PSU_INIT_DIR=/path/to/hw-description # if not in hal/board/zynqmp/ +``` + +The SD variant is `config/examples/zynqmp_fsbl_sd.config`. wolfBoot is ~197 KB of the 256 KB OCM (single-stage; wolfBoot's own code does not need DDR). 
+ +**Package the boot image.** wolfBoot is the bootloader; there is no `zynqmp_fsbl.elf`: + +```sh +cp wolfboot.elf pmufw.elf tools/scripts/zcu102/ # pmufw.elf from PetaLinux/Vitis +cd tools/scripts/zcu102 +bootgen -arch zynqmp -image zynqmp_wolfboot_fsbl.bif -w -o BOOT.BIN +``` + +Use `zynqmp_wolfboot_fsbl_auth.bif` for the Xilinx hardware root of trust (RSA authentication via the eFuse PPK; see comments in that file). + +**Flash QSPI** (over JTAG, any boot mode; `program_flash` is under the Vitis `bin/`): + +```sh +program_flash -f BOOT.BIN -fsbl zynqmp_fsbl.elf \ + -flash_type qspi-x8-dual_parallel -flash_density 1024 \ + -verify -url tcp:127.0.0.1:3121 +``` + +`zynqmp_fsbl.elf` here is the stock Xilinx FSBL used only as the `program_flash` flash-writer bootstrap (it is not flashed). Then set SW6 to QSPI32 and power-cycle. + +**SD card.** Write `BOOT.BIN` to the FAT (boot) partition of the SD card, set SW6 to SD, power-cycle. + +**Boot mode switches (SW6).** See the SD-card SW6 table below (JTAG `0000`, QSPI32 `0010`, SD1 `1110`). + +**Downstream images (direct-kernel BL33).** The boot image wolfBoot loads is a wolfBoot-signed FIT containing an `atf` (BL31) sub-image plus the kernel and DTB. Two requirements: build BL31 with its link base in DDR (the default OCM `0xFFFE0000` overlaps wolfBoot), and, because BL33 is the kernel directly (no U-Boot), apply `tools/scripts/zcu102/tf-a-zynqmp-wolfboot-dtb.patch` so BL31 forwards the DTB (which wolfBoot publishes in `PMU_GLOBAL.GLOBAL_GEN_STORAGE5`) into the kernel's `x0`. + +**Validation status (ZCU102).** The FSBL path is hardware-validated: wolfBoot runs at `Current EL: 3`, cold-boot `psu_init` brings up DDR (read/write tested), clocks, MIO, UART, and QSPI (flash ID read), from both a JTAG load and a real BootROM QSPI cold boot. The full downstream chain to Linux (FIT + DDR-linked BL31 + the TF-A patch) is in progress. + ### Prerequisites 1. 
**Xilinx Vitis 2024.1 or newer** diff --git a/hal/board/zynqmp/.gitignore b/hal/board/zynqmp/.gitignore new file mode 100644 index 0000000000..565d960f99 --- /dev/null +++ b/hal/board/zynqmp/.gitignore @@ -0,0 +1,8 @@ +# The board PS init is generated from your XSA by the Xilinx tools and carries +# the Xilinx copyright; it is board/design specific and is NOT tracked in the +# wolfBoot tree. Drop your generated files here (see README.txt). +psu_init_gpl.c +psu_init_gpl.h +psu_init.c +psu_init.h +*.o diff --git a/hal/board/zynqmp/README.txt b/hal/board/zynqmp/README.txt new file mode 100644 index 0000000000..344204872c --- /dev/null +++ b/hal/board/zynqmp/README.txt @@ -0,0 +1,42 @@ +wolfBoot ZynqMP FSBL replacement - board PS init drop-in +======================================================== + +When wolfBoot replaces the Xilinx FSBL (ZYNQMP_FSBL=1), it must run the +board-specific PS initialization (PLLs, DDR controller + PHY training, MIO pin +mux, clocks) before any DDR/UART/QSPI/SD access. That init is generated by the +Xilinx tools from your hardware design (XSA) and is specific to your board and +DDR configuration. It carries the Xilinx copyright and is therefore NOT part of +the wolfBoot source tree. + +What to drop here +----------------- +Copy your generated files into this directory: + + hal/board/zynqmp/psu_init_gpl.c + hal/board/zynqmp/psu_init_gpl.h + +For the ZCU102 reference board these are produced by PetaLinux/Vitis, e.g.: + + <petalinux-project>/project-spec/hw-description/psu_init_gpl.c + <petalinux-project>/project-spec/hw-description/psu_init_gpl.h + +or from the FSBL app sources: + + .../embeddedsw/lib/sw_apps/zynqmp_fsbl/misc/zcu102/psu_init_gpl.c + +Use the GPL variant (psu_init_gpl.*). Do not edit it. + +How it builds +------------- +The unmodified Xilinx file includes <xil_io.h> and <sleep.h>. wolfBoot supplies +minimal, wolfSSL-owned shims for those at hal/zynqmp/ (added to the include +path automatically).
The Xilinx file defines the top-level psu_init(), which +hal/zynq.c calls at the start of hal_init(). + +Build (default location is this directory): + + make ZYNQMP_FSBL=1 + +Or point at the files elsewhere: + + make ZYNQMP_FSBL=1 ZYNQMP_PSU_INIT_DIR=/path/to/hw-description diff --git a/hal/board/zynqmp/pm_cfg_obj.c b/hal/board/zynqmp/pm_cfg_obj.c new file mode 100644 index 0000000000..6913a3511a --- /dev/null +++ b/hal/board/zynqmp/pm_cfg_obj.c @@ -0,0 +1,105 @@ +/* ZynqMP PMU firmware configuration object -- design-specific data. + * + * The PMU firmware needs this EEMI permission table to grant the APU access + * to the SoC power/clock/reset/peripheral nodes. On a stock boot the Xilinx + * FSBL loads it into the PMU via the PM_SET_CONFIGURATION IPI; wolfBoot (as + * the FSBL replacement) does the same -- see zynqmp_pm_set_configuration() in + * hal/zynq.c. Without it the PMU default-denies every node request and Linux + * peripheral drivers fail probe with -EACCES. + * + * This is numeric configuration data generated by the Xilinx tools from this + * hardware design (xczu9eg / ZCU102) -- the same board-specific role as + * psu_init_gpl.c, supplied from ZYNQMP_PSU_INIT_DIR and not part of the core + * wolfBoot tree. Regenerate per design by extracting the .sys_cfg_data + * section (symbol XPm_ConfigObject) from that design zynqmp_fsbl.elf: + * aarch64-none-elf-objcopy -O binary --only-section=.sys_cfg_data + * zynqmp_fsbl.elf pm_cfg_obj.bin + * then emit the words below (little-endian u32). 
+ */ +#include <stdint.h> + +const uint32_t pm_cfg_obj[] = { + 0x00000002, 0x00000008, 0x00000001, 0x00000101, 0x00000003, 0x00000001, + 0x00000001, 0xFFFFFFFF, 0x00000300, 0x00000300, 0x00000007, 0x00000100, + 0xFFFFFFFF, 0x00000201, 0x00000201, 0x00000008, 0x00000200, 0xFFFFFFFF, + 0x00000101, 0x00000101, 0x00000102, 0x00000028, 0x0000000B, 0x00000001, + 0x00000301, 0x0000000C, 0x00000001, 0x00000301, 0x0000000D, 0x00000001, + 0x00000301, 0x0000000E, 0x00000001, 0x00000301, 0x0000000F, 0x00000001, + 0x00000101, 0x00000010, 0x00000001, 0x00000101, 0x00000011, 0x00000001, + 0x00000201, 0x00000012, 0x00000001, 0x00000201, 0x00000013, 0x00000001, + 0x00000301, 0x00000014, 0x00000001, 0x00000301, 0x00000015, 0x00000001, + 0x00000301, 0x00000016, 0x00000001, 0x00000301, 0x00000018, 0x00000001, + 0x00000301, 0x00000019, 0x00000001, 0x00000301, 0x0000001A, 0x00000001, + 0x00000301, 0x0000001B, 0x00000001, 0x00000301, 0x0000001C, 0x00000001, + 0x00000301, 0x00000020, 0x00000001, 0x00000301, 0x00000021, 0x00000001, + 0x00000301, 0x00000022, 0x00000001, 0x00000301, 0x00000023, 0x00000001, + 0x00000301, 0x00000024, 0x00000001, 0x00000301, 0x00000025, 0x00000001, + 0x00000301, 0x00000026, 0x00000001, 0x00000301, 0x00000028, 0x00000001, + 0x00000301, 0x00000029, 0x00000001, 0x00000301, 0x0000002A, 0x00000001, + 0x00000301, 0x0000002B, 0x00000001, 0x00000301, 0x0000002D, 0x00000001, + 0x00000301, 0x0000002E, 0x00000001, 0x00000301, 0x00000030, 0x00000001, + 0x00000301, 0x00000031, 0x00000001, 0x00000301, 0x00000037, 0x00000001, + 0x00000301, 0x00000038, 0x00000000, 0x00000001, 0x00000039, 0x00000000, + 0x00000100, 0x00000040, 0x00000000, 0x00000200, 0x0000003A, 0x00000001, + 0x00000301, 0x0000003B, 0x00000001, 0x00000301, 0x0000003D, 0x00000001, + 0x00000301, 0x00000045, 0x00000001, 0x00000301, 0x00000103, 0x00000003, + 0x00000001, 0x0000000C, 0x00000037, 0x00000002, 0x00000003, 0x00000003, + 0x00000013, 0x00000002, 0x00000003, 0x00000003, 0x0000000B, 0x00000002, + 0x00000003, 0x00000003, 
0x0000000C, 0x00000002, 0x00000003, 0x00000003, + 0x0000000D, 0x00000002, 0x00000003, 0x00000003, 0x0000000E, 0x00000002, + 0x00000003, 0x00000003, 0x00000025, 0x00000002, 0x00000003, 0x00000003, + 0x00000026, 0x00000002, 0x00000003, 0x00000003, 0x00000028, 0x00000002, + 0x00000003, 0x00000003, 0x0000002D, 0x00000002, 0x00000003, 0x00000003, + 0x00000045, 0x00000002, 0x00000003, 0x00000003, 0x00000038, 0x00000002, + 0x00000003, 0x00000003, 0x00000100, 0x00000003, 0x0000000F, 0x00000002, + 0x00000003, 0x00000003, 0x00000010, 0x00000002, 0x00000003, 0x00000003, + 0x00000039, 0x00000002, 0x00000003, 0x00000003, 0x00000200, 0x00000003, + 0x00000011, 0x00000002, 0x00000003, 0x00000003, 0x00000012, 0x00000002, + 0x00000003, 0x00000003, 0x00000040, 0x00000002, 0x00000003, 0x00000003, + 0x00000104, 0x00000004, 0x00000001, 0x00000300, 0x00000006, 0x00000301, + 0x0000000A, 0x00000300, 0x00000009, 0x00000301, 0x00000105, 0x00000078, + 0x000003E8, 0x00000301, 0x000003E9, 0x00000301, 0x000003EA, 0x00000301, + 0x000003EB, 0x00000301, 0x000003EC, 0x00000301, 0x000003ED, 0x00000301, + 0x000003EE, 0x00000301, 0x000003EF, 0x00000301, 0x000003F0, 0x00000301, + 0x000003F1, 0x00000301, 0x000003F2, 0x00000301, 0x000003F3, 0x00000301, + 0x000003F4, 0x00000301, 0x000003F5, 0x00000301, 0x000003F6, 0x00000301, + 0x000003F7, 0x00000301, 0x000003F8, 0x00000301, 0x000003F9, 0x00000301, + 0x000003FA, 0x00000301, 0x000003FB, 0x00000301, 0x000003FC, 0x00000301, + 0x000003FD, 0x00000301, 0x000003FE, 0x00000301, 0x000003FF, 0x00000301, + 0x00000400, 0x00000301, 0x00000401, 0x00000301, 0x00000402, 0x00000301, + 0x00000403, 0x00000301, 0x00000404, 0x00000301, 0x00000405, 0x00000301, + 0x00000406, 0x00000301, 0x00000407, 0x00000301, 0x00000408, 0x00000301, + 0x00000409, 0x00000301, 0x0000040A, 0x00000301, 0x0000040B, 0x00000301, + 0x0000040C, 0x00000301, 0x0000040D, 0x00000301, 0x0000040E, 0x00000301, + 0x0000040F, 0x00000301, 0x00000410, 0x00000301, 0x00000411, 0x00000301, + 0x00000412, 0x00000301, 
0x00000413, 0x00000301, 0x00000414, 0x00000301, + 0x00000415, 0x00000301, 0x00000416, 0x00000301, 0x00000417, 0x00000301, + 0x00000418, 0x00000301, 0x00000419, 0x00000301, 0x0000041A, 0x00000301, + 0x0000041B, 0x00000301, 0x0000041C, 0x00000301, 0x0000041D, 0x00000301, + 0x0000041E, 0x00000301, 0x0000041F, 0x00000301, 0x00000420, 0x00000301, + 0x00000421, 0x00000301, 0x00000422, 0x00000301, 0x00000423, 0x00000301, + 0x00000424, 0x00000301, 0x00000425, 0x00000301, 0x00000426, 0x00000301, + 0x00000427, 0x00000301, 0x00000428, 0x00000301, 0x00000429, 0x00000301, + 0x0000042A, 0x00000301, 0x0000042B, 0x00000301, 0x0000042C, 0x00000000, + 0x0000042D, 0x00000301, 0x0000042E, 0x00000301, 0x0000042F, 0x00000100, + 0x00000430, 0x00000301, 0x00000431, 0x00000301, 0x00000432, 0x00000301, + 0x00000433, 0x00000301, 0x00000434, 0x00000301, 0x00000435, 0x00000301, + 0x00000436, 0x00000301, 0x00000437, 0x00000301, 0x00000438, 0x00000301, + 0x00000439, 0x00000301, 0x0000043A, 0x00000301, 0x0000043B, 0x00000301, + 0x0000043C, 0x00000301, 0x0000043D, 0x00000301, 0x0000043E, 0x00000301, + 0x0000043F, 0x00000301, 0x00000440, 0x00000301, 0x00000441, 0x00000301, + 0x00000442, 0x00000301, 0x00000443, 0x00000301, 0x00000444, 0x00000301, + 0x00000445, 0x00000301, 0x00000446, 0x00000301, 0x00000447, 0x00000301, + 0x00000448, 0x00000301, 0x00000449, 0x00000301, 0x0000044A, 0x00000301, + 0x0000044B, 0x00000301, 0x0000044C, 0x00000301, 0x0000044D, 0x00000301, + 0x0000044E, 0x00000301, 0x0000044F, 0x00000301, 0x00000450, 0x00000301, + 0x00000451, 0x00000301, 0x00000452, 0x00000301, 0x00000453, 0x00000301, + 0x00000454, 0x00000301, 0x00000455, 0x00000301, 0x00000456, 0x00000301, + 0x00000457, 0x00000301, 0x00000458, 0x00000301, 0x00000459, 0x00000301, + 0x0000045A, 0x00000301, 0x0000045B, 0x00000301, 0x0000045C, 0x00000301, + 0x0000045D, 0x00000301, 0x0000045E, 0x00000301, 0x0000045F, 0x00000301, + 0x00000107, 0x00000000, 0x00000000, 0x00000106, 0x00000301, 0x00000108, + 0x00003C04, +}; +const 
uint32_t pm_cfg_obj_words = + (uint32_t)(sizeof(pm_cfg_obj) / sizeof(pm_cfg_obj[0])); diff --git a/hal/zynq.c b/hal/zynq.c index 2097f43e7e..13940d54e6 100644 --- a/hal/zynq.c +++ b/hal/zynq.c @@ -86,8 +86,12 @@ typedef struct QspiDev { } QspiDev_t; static QspiDev_t mDev; +#ifndef WOLFBOOT_ZYNQMP_FSBL +/* PMU firmware version, queried over SMC to ARM-TF. Not available when wolfBoot + * is itself the FSBL running at EL3 (no ATF below). */ static uint32_t pmuVer; #define PMUFW_MIN_VER 0x10001 /* v1.1*/ +#endif /* forward declarations */ static int qspi_wait_ready(QspiDev_t* dev); @@ -265,6 +269,102 @@ uint32_t pmu_get_version(void) return ret_payload[1]; } +#if defined(WOLFBOOT_ZYNQMP_FSBL) && defined(WOLFBOOT_ZYNQMP_PM_CFG) +/* Direct APU->PMU IPI transport for loading the PMU configuration object. + * + * pmu_request() above talks to the PMU over an SMC to the ARM Trusted Firmware + * SIP service. When wolfBoot is itself the FSBL running at EL3 there is no ATF + * below it, so the SMC path is unavailable and we must reach the PMU firmware + * directly over the APU->PMU IPI channel, mirroring what the Xilinx FSBL does + * via XPm_SetConfiguration(). See hal/board/zynqmp/pm_cfg_obj.c for the data. */ +#define PMU_GLOBAL_CNTRL_REG 0xFFD80000UL +#define PMU_GLOBAL_FW_IS_PRESENT (1UL << 4) /* GLOBAL_CNTRL[FW_IS_PRESENT] */ + +/* APU IPI block (base 0xFF300000): TRIG at +0x00, OBS at +0x04. The PMU + * channel-0 request is bit 16 in the APU trigger/observation registers. */ +#define IPI_APU_TRIG_REG 0xFF300000UL +#define IPI_APU_OBS_REG 0xFF300004UL +#define IPI_PMU_CH0_MASK 0x00010000UL + +/* IPI message RAM (base 0xFF990000). APU(buffer index 2) -> PMU(index 7): + * request = base + 2*0x200 + 7*0x40 = 0xFF9905C0 + * response = base + 2*0x200 + 7*0x40 + 0x20 = 0xFF9905E0 */ +#define IPI_APU_TO_PMU_REQ_BUF 0xFF9905C0UL +#define IPI_PMU_TO_APU_RESP_BUF 0xFF9905E0UL + +/* PM API id (payload word 0) for loading the configuration object. 
*/ +#define PM_SET_CONFIGURATION 0x02U + +#define PM_IPI_POLL_MAX 10000000UL + +extern const uint32_t pm_cfg_obj[]; +extern const uint32_t pm_cfg_obj_words; + +/* Send the PMU configuration object to the PMU firmware so it programs its + * EEMI access-control table (which masters may control which nodes). Returns + * the PMU status word (0 == success) or -1 on transport failure. */ +int zynqmp_pm_set_configuration(void) +{ + volatile uint32_t* fw_present = (volatile uint32_t*)PMU_GLOBAL_CNTRL_REG; + volatile uint32_t* req = (volatile uint32_t*)IPI_APU_TO_PMU_REQ_BUF; + volatile uint32_t* resp = (volatile uint32_t*)IPI_PMU_TO_APU_RESP_BUF; + volatile uint32_t* trig = (volatile uint32_t*)IPI_APU_TRIG_REG; + volatile uint32_t* obs = (volatile uint32_t*)IPI_APU_OBS_REG; + uint32_t status; + uint32_t timeout; + + /* PMU firmware must be running to accept the object. */ + if ((*fw_present & PMU_GLOBAL_FW_IS_PRESENT) == 0) { + wolfBoot_printf("PM config: PMUFW not present, skipping\n"); + return -1; + } + + /* The PMU reads the object from memory by address; make sure our copy is + * visible at the point of coherency before handing over the pointer. */ + flush_dcache_range((unsigned long)&pm_cfg_obj[0], + (unsigned long)&pm_cfg_obj[0] + sizeof(pm_cfg_obj[0]) * pm_cfg_obj_words); + + /* Wait for the PMU channel to be free (observation bit clear). */ + timeout = PM_IPI_POLL_MAX; + while ((*obs & IPI_PMU_CH0_MASK) != 0) { + if (--timeout == 0) { + wolfBoot_printf("PM config: IPI channel busy\n"); + return -1; + } + } + + /* Build the request: [PM_SET_CONFIGURATION, cfg-object address]. The + * object lives in OCM (< 4GB) so the 32-bit truncation is exact. */ + req[0] = PM_SET_CONFIGURATION; + req[1] = (uint32_t)(uintptr_t)&pm_cfg_obj[0]; + req[2] = 0; + req[3] = 0; + req[4] = 0; + req[5] = 0; + req[6] = 0; + __asm__ volatile("dsb sy" ::: "memory"); + + /* Trigger the IPI to PMU channel 0. 
*/ + *trig = IPI_PMU_CH0_MASK; + __asm__ volatile("dsb sy" ::: "memory"); + + /* Wait for the PMU to acknowledge (clears the observation bit). */ + timeout = PM_IPI_POLL_MAX; + while ((*obs & IPI_PMU_CH0_MASK) != 0) { + if (--timeout == 0) { + wolfBoot_printf("PM config: no PMUFW ack\n"); + return -1; + } + } + + /* Response word 0 holds the PMU status (0 == XST_SUCCESS). */ + status = resp[0]; + wolfBoot_printf("PM config: PMUFW status 0x%x (%d words)\n", + (unsigned int)status, (int)pm_cfg_obj_words); + return (int)status; +} +#endif /* WOLFBOOT_ZYNQMP_FSBL && WOLFBOOT_ZYNQMP_PM_CFG */ + /* Aligned data buffer for DMA */ #define EFUSE_MAX_BUFSZ (sizeof(pmu_efuse) + 48 /* SHA3-384 Digest */) static uint8_t XALIGNED(32) efuseBuf[EFUSE_MAX_BUFSZ]; @@ -862,6 +962,32 @@ static inline int qspi_dmaisr_wait(uint32_t wait_mask, uint32_t wait_val) } return 0; } + +/* INVALIDATE-only D-cache maintenance (dc ivac) for a DMA-read destination -- + * the correct post-DMA-read operation (matches Xil_DCacheInvalidateRange / + * dma_unmap DMA_FROM_DEVICE). flush_dcache_range() is dc CIVAC (clean + + * invalidate); its CLEAN step can write a speculatively-filled cache line back + * to DDR AFTER the DMA, clobbering the freshly-DMA'd data. Use this instead for + * the post-DMA step so nothing is ever written back over the DMA result. + * + * Exception: a 64-byte line that [start, end) covers only partially also holds + * bytes owned by someone else. A bare invalidate would discard any dirty data + * in those bytes, so such edge lines get dc civac (clean + invalidate), as the + * Linux arm64 DMA invalidate path does. Lines fully inside the range still get + * the invalidate-only dc ivac. 
*/ +static inline void qspi_dcache_inval(unsigned long start, unsigned long end) +{ + unsigned long a; + __asm__ volatile("dsb sy" ::: "memory"); + for (a = (start & ~63UL); a < end; a += 64UL) { + if (a < start || (end - a) < 64UL) { + __asm__ volatile("dc civac, %0" : : "r"(a) : "memory"); + } else { + __asm__ volatile("dc ivac, %0" : : "r"(a) : "memory"); + } + } + __asm__ volatile("dsb sy" ::: "memory"); +} #endif static int qspi_gen_fifo_write(uint32_t reg_genfifo) @@ -995,6 +1111,15 @@ static uint32_t qspi_calc_exp(uint32_t xferSz, uint32_t* reg_genfifo) #ifndef GQSPI_MODE_IO static uint8_t XALIGNED(QQSPI_DMA_ALIGN) dmatmp[GQSPI_DMA_TMPSZ]; #endif +#ifdef ZYNQMP_QSPI_OCM_BOUNCE +/* Fixed OCM scratch for the per-chunk DMA->OCM->memcpy body-load path. 
Sized + * to the max ext_flash_read chunk (4KB). Non-static: referenced by extern. */ +uint8_t XALIGNED(64) zynq_ocm_bounce[4096]; +#if defined(DEBUG_ZYNQ) && defined(ZYNQMP_QSPI_BOUNCE_PROGRESS) +static uint32_t bnc_qa[12]; +static uint32_t bnc_rd0[12]; +#endif +#endif static int qspi_transfer(QspiDev_t* pDev, const uint8_t* cmdData, uint32_t cmdSz, @@ -1008,6 +1133,21 @@ static int qspi_transfer(QspiDev_t* pDev, uint8_t* dmarxptr = NULL; #endif GQSPI_EN = 1; /* Enable device */ +#ifdef ZYNQMP_QSPI_FIFO_RESET + /* Reset the generic / TX / RX FIFOs at the start of every transfer so no + * stale or prefetched RX data carries over from the previous (adjacent) + * read. wolfBoot otherwise never resets these (only GQSPI_EN toggles); the + * Xilinx driver does. Carryover across back-to-back contiguous reads is the + * prime suspect for the +16KB body-load shift (isolated/sparse reads are + * correct, only adjacent sequential reads drift). Bits are self-clearing. */ + GQSPI_FIFO_CTRL = (GQSPI_FIFO_CTRL_RST_GEN_FIFO | + GQSPI_FIFO_CTRL_RST_TX_FIFO | + GQSPI_FIFO_CTRL_RST_RX_FIFO); +#endif +#ifdef ZYNQMP_QSPI_MANUAL_START + /* Manual start: hold execution off until the whole message is queued. */ + GQSPI_CFG &= ~GQSPI_CFG_START_GEN_FIFO; +#endif qspi_cs(pDev, 1); /* Select slave */ /* Setup bus slave selection */ @@ -1078,6 +1218,108 @@ static int qspi_transfer(QspiDev_t* pDev, } /* RX Data */ +#ifndef GQSPI_MODE_IO + /* Single-DMA RX path: arm ONE DMA for the entire destination buffer, then + * issue as many gen-FIFO RX (EXP) entries as needed to clock all the bytes. + * Every gen-FIFO entry feeds the SAME ongoing DMA, so the SPI read streams + * continuously across entries -- matching the Xilinx xqspipsu driver. The + * previous code stopped and re-armed the DMA (new GQSPIDMA_DST/SIZE) for + * each chunk; re-arming mid-stream desynced the read so all but one chunk + * landed as zeros (a delay between chunks hid it by letting the FIFO drain). 
+ * Requires a DMA-aligned dst and a 4-byte-multiple size; small unaligned or + * odd-length reads fall back to the bounce buffer below. */ + if (ret == GQSPI_CODE_SUCCESS && rxData && rxSz > 0 && + (((size_t)rxData & (QQSPI_DMA_ALIGN-1)) == 0) && ((rxSz & 3) == 0)) { + uint32_t rxbase; + uint32_t tc, expo, imm; + + rxbase = reg_genfifo & ~(GQSPI_GEN_FIFO_TX | GQSPI_GEN_FIFO_IMM_MASK | + GQSPI_GEN_FIFO_EXP_MASK); + rxbase |= (GQSPI_GEN_FIFO_RX | GQSPI_GEN_FIFO_DATA_XFER); + rxbase |= (pDev->stripe & GQSPI_GEN_FIFO_STRIPE); + + /* Arm the DMA once for the whole transfer. */ + GQSPIDMA_ISR = GQSPIDMA_ISR_DONE; + (void)GQSPIDMA_ISR; + GQSPIDMA_DST = ((uintptr_t)rxData & 0xFFFFFFFF); + GQSPIDMA_DST_MSB = (((uintptr_t)rxData >> 32) & 0xFFF); + GQSPIDMA_SIZE = rxSz; + GQSPIDMA_IER = GQSPIDMA_ISR_DONE; +#ifndef ZYNQMP_QSPI_COHERENT + flush_dcache_range((unsigned long)rxData, + (unsigned long)rxData + rxSz); +#endif + + /* Issue gen-FIFO RX entries to clock all rxSz bytes, EXACTLY as the + * Xilinx xqspipsu driver does: one EXP entry per set bit of the byte + * count, in ASCENDING exponent order (2^8, 2^9, ...), then a final IMM + * entry for the low-byte remainder. The previous code emitted the + * largest power of two FIRST (a single 2^17 = 128KB EXP entry up front), + * which made the first 128KB stream in as zeros and desynced the rest. + * qspi_gen_fifo_write blocks on GEN_FIFO_NOT_FULL; the controller + * (auto-start, DMA mode) processes the entries into the one DMA. 
*/ + tc = rxSz; + expo = 8; /* 2^8 = 256, smallest EXP unit */ + imm = rxSz & 0xFFU; /* low-byte remainder -> IMM entry */ + while (tc != 0 && ret == GQSPI_CODE_SUCCESS) { + if (tc & 0x100U) { /* bit 'expo' of the original byte count */ + ret = qspi_gen_fifo_write(rxbase | GQSPI_GEN_FIFO_EXP_MASK | + GQSPI_GEN_FIFO_IMM(expo)); + } + tc >>= 1; + expo++; + } + if (ret == GQSPI_CODE_SUCCESS && imm != 0) { + ret = qspi_gen_fifo_write((rxbase & ~GQSPI_GEN_FIFO_EXP_MASK) | + GQSPI_GEN_FIFO_IMM(imm)); + } + + if (ret == GQSPI_CODE_SUCCESS) { +#ifdef ZYNQMP_QSPI_MANUAL_START + /* whole message (cmd+addr+dummy+RX) queued + DMA armed -> trigger */ + GQSPI_CFG |= GQSPI_CFG_START_GEN_FIFO; +#endif + if (qspi_dmaisr_wait(GQSPIDMA_ISR_DONE, 0)) + return GQSPI_CODE_TIMEOUT; + if (qspi_isr_wait(GQSPI_IXR_GEN_FIFO_EMPTY, 0)) + return GQSPI_CODE_TIMEOUT; + GQSPIDMA_ISR = GQSPIDMA_ISR_DONE; + (void)GQSPIDMA_ISR; + GQSPIDMA_STS = GQSPIDMA_STS | GQSPIDMA_STS_WTC; /* clear WTC (W1C) */ +#ifndef ZYNQMP_QSPI_COHERENT + /* DMA is not cache-coherent: INVALIDATE (not clean+invalidate) the + * destination so the CPU reads the freshly DMA'd data and nothing + * is written back over it. */ + qspi_dcache_inval((unsigned long)rxData, + (unsigned long)rxData + rxSz); +#else + /* Coherent DMA (CCI): a barrier suffices; no maintenance. */ + __asm__ volatile("dsb sy" ::: "memory"); +#endif +#if defined(DEBUG_ZYNQ) + /* Ground truth for the first body chunk: GQSPIDMA register state + + * the word the DMA just wrote, read RIGHT HERE (before anything + * else can touch it). w0 correct here but 0 later => clobber; w0=0 + * here => the DMA itself wrote wrong. */ + if ((uintptr_t)rxData >= 0x10000000UL && + (uintptr_t)rxData < 0x10018000UL) { + /* Print the programmed DMA dst, the word this chunk wrote, AND + * the base word (0x10000000) every chunk -- when base0 flips + * from 464C457F to 0 we catch the chunk whose DMA clobbered the + * base. (Run with ZYNQMP_DCACHE_OFF_LOAD so these read DDR.) 
*/ + wolfBoot_printf("DMAk: rx=%08x progDST=%08x w0=%08x base0=%08x\n", + (uint32_t)(uintptr_t)rxData, (uint32_t)GQSPIDMA_DST, + *(volatile uint32_t*)rxData, + *(volatile uint32_t*)0x10000000UL); + } +#endif + } + rxSz = 0; + } +#endif + + /* Bounce / I/O fallback: small unaligned or odd-length reads (e.g. flash ID + * and status). Per-entry transfer through the aligned bounce buffer. */ while (ret == GQSPI_CODE_SUCCESS && rxData && rxSz > 0) { /* Enable RX */ reg_genfifo &= ~(GQSPI_GEN_FIFO_TX | GQSPI_GEN_FIFO_IMM_MASK | @@ -1099,6 +1341,8 @@ static int qspi_transfer(QspiDev_t* pDev, xferSz = qspi_calc_exp(xferSz, &reg_genfifo); } + GQSPIDMA_ISR = GQSPIDMA_ISR_DONE; + (void)GQSPIDMA_ISR; GQSPIDMA_DST = ((uintptr_t)dmarxptr & 0xFFFFFFFF); GQSPIDMA_DST_MSB = ((uintptr_t)dmarxptr >> 32); GQSPIDMA_SIZE = xferSz; @@ -1107,14 +1351,6 @@ static int qspi_transfer(QspiDev_t* pDev, (unsigned long)dmarxptr + xferSz); #endif -#if defined(DEBUG_ZYNQ) && DEBUG_ZYNQ >= 2 - #ifndef GQSPI_MODE_IO - wolfBoot_printf("DMA: ptr %p, xferSz %d\n", dmarxptr, xferSz); - #else - wolfBoot_printf("IO: ptr %p, xferSz %d\n", rxData, xferSz); - #endif -#endif - /* Submit general FIFO operation */ ret = qspi_gen_fifo_write(reg_genfifo); if (ret != GQSPI_CODE_SUCCESS) { @@ -1122,6 +1358,9 @@ static int qspi_transfer(QspiDev_t* pDev, break; } +#ifdef ZYNQMP_QSPI_MANUAL_START + GQSPI_CFG |= GQSPI_CFG_START_GEN_FIFO; /* trigger this entry */ +#endif #ifdef GQSPI_MODE_IO /* Read FIFO */ ret = gspi_fifo_rx(rxData, xferSz); @@ -1133,7 +1372,14 @@ static int qspi_transfer(QspiDev_t* pDev, if (qspi_dmaisr_wait(GQSPIDMA_ISR_DONE, 0)) { return GQSPI_CODE_TIMEOUT; } + if (qspi_isr_wait(GQSPI_IXR_GEN_FIFO_EMPTY, 0)) { + return GQSPI_CODE_TIMEOUT; + } GQSPIDMA_ISR = GQSPIDMA_ISR_DONE; /* clear DMA interrupt */ + (void)GQSPIDMA_ISR; /* read-back: force W1C to post */ + GQSPIDMA_STS = GQSPIDMA_STS | GQSPIDMA_STS_WTC; /* clear WTC (W1C) */ + qspi_dcache_inval((unsigned long)dmarxptr, + (unsigned long)dmarxptr +
xferSz); /* adjust xfer sz */ if (xferSz > rxSz) xferSz = rxSz; @@ -1141,13 +1387,6 @@ static int qspi_transfer(QspiDev_t* pDev, if (dmarxptr != rxData) { memcpy(rxData, dmarxptr, xferSz); } - #if defined(DEBUG_ZYNQ) && DEBUG_ZYNQ >= 3 - if (xferSz <= 1024) { - for (uint32_t i=0; iDDR read comes + * back zeros" bug. Reads the SAME flash source (the signed boot image at the + * boot-partition base) to TWO different DDR destinations as the very first + * reads after init (no priming reads, immediate cache-invalidated dump). If + * the two destinations disagree the fault is destination-region dependent (a + * firewall/XMPU or DMA-addressing issue); if both show the same first-128KB + * zeros the fault is in the large-read mechanism (size/EXP/stripe) and is + * destination-independent; if both are correct the isolated read works and the + * fault is in the surrounding ramboot flow (sequencing). */ +static uint32_t qspi_diag_csum(volatile uint32_t* p, uint32_t bytes) +{ + uint32_t i, s = 0; + for (i = 0; i < bytes / 4; i++) { + s = (s + p[i]) & 0xFFFFFFFFUL; + } + return s; +} +void qspi_ddr_diag(void) +{ + /* Read two larger regions and checksum them (sum of u32 words), comparing + * against the values computed from the signed image file. Whether this runs + * in DMA or PIO mode is selected at BUILD time (GQSPI_MODE_IO); running it + * in PIO isolates whether the QSPIDMA is the corruptor: + * - 32KB @ flash 0x800400 (body start): exp word0=464c457f csum=77950825 + * - 16KB @ flash 0x818000 (dense ELF strtab): exp word0=2f1e0000 csum=0ae78d99 + * If PIO matches but DMA does not, the QSPIDMA write path is the fault. */ + /* Isolate bug #2 (data past the per-chip 64KB / combined 128KB boundary). + * Read 16KB regions BEFORE and AFTER the boundary via 64-byte FIFO-safe + * SEPARATE transfers (works in PIO and DMA). 
If the BEFORE region passes + * and the AFTER regions fail, the flash READ across the per-chip 64KB + * boundary is broken (addressing); if all pass, the flash read is fine and + * bug #2 is in the large body-load DMA/cache path, not the read. */ + static const struct { uint32_t foff; uint32_t exp; } regs[] = { + { 0x18000, 0x0ae78d99UL }, /* 96KB - before boundary */ + { 0x28000, 0x6d6e0fb6UL }, /* 160KB - past boundary */ + { 0x30000, 0xf7483d23UL } /* 192KB - past boundary */ + }; + int n = (int)(sizeof(regs)/sizeof(regs[0])); + volatile uint32_t* d = (volatile uint32_t*)0x20000000UL; + uint32_t off; + int i; + + wolfBoot_printf("QSPI DIAG: boundary test (mode=%s, 4KB reads)\n", +#ifdef GQSPI_MODE_IO + "PIO" +#else + "DMA" +#endif + ); + for (i = 0; i < n; i++) { + /* read each 16KB region as 4KB chunks (the size that works for the body + * load) to a FRESH small scratch -- fits in cache, isolates the FLASH + * READ across the per-chip 64KB boundary from the cumulative 225KB load. + * COHERENT build: read the scratch back from cache (no invalidate). */ + for (off = 0; off < 0x4000UL; off += 0x1000UL) { + (void)ext_flash_read( + WOLFBOOT_PARTITION_BOOT_ADDRESS + regs[i].foff + off, + (uint8_t*)d + off, 0x1000); + } + wolfBoot_printf(" flash+0x%05x w0=%08x csum=%08x exp=%08x %s\n", + regs[i].foff, (uint32_t)d[0], qspi_diag_csum(d, 0x4000), + regs[i].exp, + (qspi_diag_csum(d, 0x4000) == regs[i].exp) ? "OK" : "BAD"); + } +} +#endif /* ZYNQMP_QSPI_DIAG */ + void qspi_init(void) { int ret; @@ -1472,6 +1797,13 @@ void qspi_init(void) #ifdef GQSPI_MODE_IO reg_cfg = GQSPI_CFG_MODE_EN_IO; /* Use I/O Transfer Mode */ reg_cfg |= GQSPI_CFG_START_GEN_FIFO; /* Trigger GFIFO commands to start */ +#elif defined(ZYNQMP_QSPI_MANUAL_START) + /* MANUAL generic-FIFO start (matches Xilinx xqspipsu/Linux): queue the whole + * message's entries, then trigger execution as one batch. 
Auto-start lets + * the controller free-run as entries are written, which (with the DMA) + * desyncs large transfers. */ + reg_cfg = GQSPI_CFG_MODE_EN_DMA; + reg_cfg |= GQSPI_CFG_GEN_FIFO_START_MODE; /* manual start */ #else reg_cfg = GQSPI_CFG_MODE_EN_DMA; /* Use DMA Transfer Mode */ #endif @@ -1525,14 +1857,41 @@ void qspi_init(void) GQSPI_DATA_DLY_ADJ = 0; #endif +#if defined(DEBUG_ZYNQ) + /* Verify the QSPI clock + tap-delay assumptions: QSPI_REF_CTRL gives the + * actual ref-clock divisors (banner "Ref=125MHz" is only an assumption); + * read back the tap registers to confirm the EL3 writes actually landed. */ + wolfBoot_printf("QSPI clk: REF_CTRL=0x%x TAPDLY=0x%x LPBK=0x%x DATADLY=0x%x\n", + (uint32_t)QSPI_REF_CTRL, (uint32_t)IOU_TAPDLY_BYPASS, + (uint32_t)GQSPI_LPBK_DLY_ADJ, (uint32_t)GQSPI_DATA_DLY_ADJ); +#endif + /* Initialize hardware parameters for Threshold and Interrupts */ GQSPI_TX_THRESH = 1; GQSPI_RX_THRESH = 1; GQSPI_GF_THRESH = 31; - /* Reset DMA */ + /* Reset DMA. NOTE: write only DST_CTRL, exactly like the Xilinx baremetal + * (xqspipsu) FSBL driver and the Linux spi-zynqmp-gqspi driver -- BOTH of + * which read large images from this part reliably and NEITHER of which ever + * writes QSPIDMA_DST_CTRL2 (0x824). CTRL2 holds the AWCACHE bits and the + * RAM_EMASA/EMASB FIFO-RAM timing-margin bits; overriding the silicon's + * margins corrupts sustained DMA transfers while leaving small ones intact. + * wolfBoot previously wrote CTRL2=0x081BFFF8 here, which is the suspected + * root cause of the large-QSPI-read-to-DDR corruption. */ GQSPIDMA_CTRL = GQSPIDMA_CTRL_DEF; - GQSPIDMA_CTRL2 = GQSPIDMA_CTRL2_DEF; +#if defined(DEBUG_ZYNQ) + wolfBoot_printf("GQSPIDMA CTRL2 (left by boot): 0x%x\n", GQSPIDMA_CTRL2); +#endif +#ifdef ZYNQMP_QSPI_WRITE_CTRL2 + GQSPIDMA_CTRL2 = GQSPIDMA_CTRL2_DEF; /* legacy behaviour, opt-in */ +#endif + /* Clear the DMA Write-Transfer-Count (DST_STS.WTC, W1C). The Linux/Xilinx + * drivers do this; wolfBoot never did. 
WTC is a saturating count of issued + * DMA write transfers -- if it is not cleared it saturates after the first + * transfer and the QSPIDMA mis-handles every SUBSEQUENT transfer (the body + * load is many transfers, hence first-works/rest-corrupt). */ + GQSPIDMA_STS = GQSPIDMA_STS | GQSPIDMA_STS_WTC; GQSPIDMA_IER = GQSPIDMA_ISR_ALL_MASK; GQSPI_EN = 1; /* Enable Device */ @@ -1603,6 +1962,38 @@ void qspi_init(void) return; #endif +#ifdef ZYNQMP_QSPI_BOUNDARY_TEST + /* Per-chip 64KB flash-sector boundary discriminator. Isolated single + * 128-byte reads (one IMM gen-FIFO entry each, FIFO-safe) on both sides of + * combined flash 0x820000, where the per-chip 4-byte-address byte[2] rolls + * 0x40->0x41. The contiguous 225KB body load is byte-exact up to here and + * wrong past it. If these ISOLATED reads are correct PAST the boundary, the + * flash read addressing is fine and the corruption is the DMA/cache for + * contiguous later chunks; if wrong, it is a flash READ addressing bug. + * Expected words are from test-app/image_v1_signed.bin (the flashed body). */ + { + static const uint32_t bt_addr[6] = { + 0x800400, 0x810400, 0x818400, 0x820400, 0x828400, 0x830400 }; + static const uint32_t bt_exp[6] = { + 0x464C457F, 0x58000300, 0x83025201, + 0x5F544F4F, 0x28282828, 0x01009F30 }; + uint8_t XALIGNED(8) bt_buf[128]; + uint32_t bt_i, bt_got; + int bt_ret; + for (bt_i = 0; bt_i < 6; bt_i++) { + memset(bt_buf, 0xEE, sizeof(bt_buf)); + bt_ret = ext_flash_read((uintptr_t)bt_addr[bt_i], bt_buf, + (int)sizeof(bt_buf)); + bt_got = *(uint32_t*)bt_buf; + wolfBoot_printf("BTEST f%08x b2=%x: got %08x exp %08x %s ret=%d\n", + (unsigned)bt_addr[bt_i], + (unsigned)(((bt_addr[bt_i] / 2) >> 16) & 0xFF), + (unsigned)bt_got, (unsigned)bt_exp[bt_i], + (bt_got == bt_exp[bt_i]) ? 
"OK " : "BAD", bt_ret); + } + } +#endif + #ifdef TEST_EXT_FLASH test_ext_flash(&mDev); #endif @@ -1634,17 +2025,208 @@ uint64_t hal_timer_ms(void) return val; } +#ifdef WOLFBOOT_ZYNQMP_FSBL +/* wolfBoot's psu_init wrapper (hal/zynqmp_psu_shim.c): runs the board + * (XSA-generated) psu_init sub-stages but fixes the system timestamp counter + * to 100MHz after clock init and before DDR training, so the training settle + * delays are accurate. Returns 0 on success. */ +extern int zynqmp_psu_init(void); +/* Board file's complete psu_init() (the one the Xilinx FSBL uses). Selectable + * for A/B testing against our wrapper via -DZYNQMP_USE_BOARD_PSU_INIT. */ +extern int psu_init(void); +#endif + +#ifdef ZYNQMP_DDR_PROBE +/* Diagnostic: CONTIGUOUS CPU fill+verify of DDR (every word), mimicking the + * byte-granular access the firmware integrity hash does. A sparse walk can + * pass while scattered marginal bytes still hang a contiguous read, so this + * writes then reads back every word over a multi-MB region. Prints progress + * per MB; reports the first mismatch (a hang pins the PC in the loop). */ +#ifndef ZYNQMP_DDR_PROBE_BYTES +#define ZYNQMP_DDR_PROBE_BYTES (8UL * 1024UL * 1024UL) /* 8MB */ +#endif +/* Results published to OCM globals so JTAG can read them even if the UART + * stalls (e.g. interconnect congestion from write-backs to marginal DDR). */ +volatile uint32_t zynqmp_dbg_ddr_done = 0; +volatile uint32_t zynqmp_dbg_ddr_mismatches = 0; +volatile uint32_t zynqmp_dbg_ddr_firstbad = 0; +volatile uint32_t zynqmp_dbg_ddr_firstgot = 0; +static void zynqmp_ddr_probe(void) +{ + volatile uint32_t* base = (volatile uint32_t*)0x10000000UL; + uint32_t i, bad; + /* Incremental escalation to pinpoint a hang: single word, then growing + * power-of-two byte counts. Print before each phase so the UART shows the + * last size that completed. A single word isolates a fundamental CPU->DDR + * fault; a hang only at large sizes isolates a cache-eviction/capacity + * path issue. 
*/ +#ifdef ZYNQMP_DDR_PROBE_NOCACHE + /* Bypass L1/L2 by clearing SCTLR_EL3.C so every access goes straight to + * DDR. Isolates the raw DDR/interconnect path from the cache-eviction/ + * linefill path. */ + { + unsigned long sctlr; + __asm__ volatile("dsb sy"); + __asm__ volatile("mrs %0, sctlr_el3" : "=r"(sctlr)); + sctlr &= ~(1UL << 2); /* clear C (data cache) */ + __asm__ volatile("msr sctlr_el3, %0; isb" : : "r"(sctlr)); + wolfBoot_printf("DDR probe: D-cache OFF\n"); + } +#endif + + /* Decisive read-vs-write isolation: write a region, then clean+invalidate + * the cache for it (forces all written data OUT to DDR and empties the + * cache), then read it back. After the flush, every read MUST be served + * from DDR (not cache). So: + * - if "flush" hangs -> the DDR WRITE-back path stalls + * - if "readback" hangs -> the DDR READ path stalls + * - if both complete with 0 bad -> CPU<->DDR fully works + * 256KB chosen to far exceed the 32KB L1 + spill L2. */ + { + const uint32_t WB = 256UL * 1024UL; /* bytes */ + uint32_t words = WB / (uint32_t)sizeof(uint32_t); + unsigned long a; + + wolfBoot_printf("DDR probe: WRITE %d KB...\n", (int)(WB / 1024)); + for (i = 0; i < words; i++) { + base[i] = 0xC0DE0000UL ^ i; + } + wolfBoot_printf("DDR probe: write loop returned\n"); + + /* clean+invalidate by VA to PoC over the region */ + wolfBoot_printf("DDR probe: FLUSH (dc civac)...\n"); + for (a = 0; a < WB; a += 64UL) { + __asm__ volatile("dc civac, %0" : : "r"((unsigned long)base + a) + : "memory"); + } + __asm__ volatile("dsb sy; isb"); + wolfBoot_printf("DDR probe: flush done (cache empty)\n"); + + bad = 0; + wolfBoot_printf("DDR probe: READBACK from DDR...\n"); + for (i = 0; i < words; i++) { + if (base[i] != (0xC0DE0000UL ^ i)) { + if (bad == 0) { + zynqmp_dbg_ddr_firstbad = (uint32_t)(uintptr_t)(base + i); + zynqmp_dbg_ddr_firstgot = base[i]; + } + zynqmp_dbg_ddr_mismatches = ++bad; /* publish running count so JTAG sees it even if the UART stalls */ + } + } + wolfBoot_printf("DDR probe: 256KB DDR r/w -> %s (%d bad)\n", + bad ?
"FAIL" : "PASS", (int)bad); + } + zynqmp_dbg_ddr_done = 2; +} +#endif + /* public HAL functions */ void hal_init(void) { const char* bootMsg = "\nwolfBoot Secure Boot\n"; +#ifdef WOLFBOOT_ZYNQMP_FSBL + /* wolfBoot is the FSBL: bring up the PLLs, DDR, MIO mux and clocks before + * any DDR, UART, QSPI or SD access. Until this runs only the OCM (where + * wolfBoot executes) and the system counter are available. */ +#ifdef ZYNQMP_USE_BOARD_PSU_INIT + (void)psu_init(); /* board's complete psu_init (FSBL's) */ +#else + (void)zynqmp_psu_init(); /* our wrapper */ +#endif +#endif + #ifdef DEBUG_UART uart_init(); #endif wolfBoot_printf(bootMsg); wolfBoot_printf("Current EL: %d\n", current_el()); +#ifdef WOLFBOOT_ZYNQMP_FSBL + { + extern unsigned long zynqmp_dbg_cntfrq_boot, zynqmp_dbg_cntfrq_used; + extern unsigned int zynqmp_dbg_ts_ctrl, zynqmp_dbg_iopll_ctrl, + zynqmp_dbg_scntr_ctrl, zynqmp_dbg_scntr_freq; + wolfBoot_printf("Timer: CNTFRQ boot=%d used=%d\n", + (int)zynqmp_dbg_cntfrq_boot, (int)zynqmp_dbg_cntfrq_used); + wolfBoot_printf("Clk: TS_REF_CTRL=0x%x IOPLL_CTRL=0x%x " + "SCNTR_CTRL=0x%x SCNTR_FREQ=0x%x\n", + zynqmp_dbg_ts_ctrl, zynqmp_dbg_iopll_ctrl, + zynqmp_dbg_scntr_ctrl, zynqmp_dbg_scntr_freq); + } + /* DDR controller + PHY training status (MMIO reads, safe even if DDR + * memory itself is marginal). DDRC STAT[2:0] operating_mode: 1=normal. + * DDR PHY PGSR0 (0xFD080030): bit0 IDONE plus per-step done bits + * [11:1]=PL/DC/ZC/DI/WL/QSG/WLA/RD/WD/RE/WE; ERR bits [27:20]=ZC/WL/QSG/ + * WLA/RD/WD/RE/WE, [28] IVERR, [29] VERR. Nonzero ERR bits => cold-boot + * DDR PHY training FAILED. 
*/ + { + volatile unsigned int* DDRC_STAT = (volatile unsigned int*)0xFD070004UL; + volatile unsigned int* PHY_PGSR0 = (volatile unsigned int*)0xFD080030UL; + volatile unsigned int* PHY_PGSR1 = (volatile unsigned int*)0xFD080034UL; + volatile unsigned int* PHY_PGCR0 = (volatile unsigned int*)0xFD080010UL; + unsigned int pgsr0 = *PHY_PGSR0; + wolfBoot_printf("DDR: STAT=0x%x PGSR0=0x%x PGSR1=0x%x PGCR0=0x%x\n", + *DDRC_STAT, pgsr0, *PHY_PGSR1, *PHY_PGCR0); + wolfBoot_printf("DDR: PHY train %s (ERR mask 0x%x)\n", + (pgsr0 & 0x3FF00000U) ? "FAIL" : "ok", + (pgsr0 & 0x3FF00000U)); + } + /* DDR clock chain: DPLL (CRF_APB 0xFD1A0000) drives the DDR clock. A wrong + * DPLL frequency or DDR_CTRL divisor mis-calibrates the read-DQS gate + * (round-trip delay in clock units) so reads fail while writes still work. + * PLL_STATUS bit1 = DPLL_LOCK. Also dump DDRC-derived read-path regs. */ + { + volatile unsigned int* DPLL_CTRL = (volatile unsigned int*)0xFD1A002CUL; + volatile unsigned int* DPLL_CFG = (volatile unsigned int*)0xFD1A0030UL; + volatile unsigned int* PLL_STAT = (volatile unsigned int*)0xFD1A0044UL; + volatile unsigned int* DDR_CTRL = (volatile unsigned int*)0xFD1A0080UL; + wolfBoot_printf("CLKMARK9 DPLL_CTRL=0x%x DPLL_CFG=0x%x PLL_STATUS=0x%x DDR_CTRL=0x%x\n", + *DPLL_CTRL, *DPLL_CFG, *PLL_STAT, *DDR_CTRL); + } + /* Per-lane DDR PHY read-path trained status/results (verbose, gated). + * DXnGSR0 @0xFD0807E0+n*0x100, GSR2 @+0x8, GTR0 @0xFD0807C0 (read-DQS gate + * system latency), LCDLR3 @+0x9C. For comparing against the stock FSBL. 
*/ +#ifdef ZYNQMP_DDR_DEBUG + { + int lane; + for (lane = 0; lane < 9; lane++) { + unsigned long b = 0xFD080700UL + (unsigned long)lane * 0x100UL; + wolfBoot_printf("DX%d GSR0=0x%x GSR2=0x%x GTR0=0x%x LCDLR3=0x%x\n", + lane, + *(volatile unsigned int*)(b + 0xE0UL), /* GSR0 */ + *(volatile unsigned int*)(b + 0xE8UL), /* GSR2 */ + *(volatile unsigned int*)(b + 0xC0UL), /* GTR0 gate sys lat */ + *(volatile unsigned int*)(b + 0x9CUL)); /* LCDLR3 rd DQS dly */ + } + } +#endif /* ZYNQMP_DDR_DEBUG */ +#endif + +#ifdef ZYNQMP_ENABLE_CCI + /* Enable CCI-400 snoop + DVM message receipt on the APU ACE slave + * interfaces (S3/S4) so the A53 cluster's Inner-Shareable cacheable + * traffic completes coherently to DDR. ATF/BL31 normally does this; as the + * FSBL replacement wolfBoot must do it itself before any large cacheable + * CPU access to DDR (e.g. the integrity-check SHA). CCI-400 GPV base is + * 0xFD6E0000; SIn Snoop Control Register at 0x1000*(n+1); Status Register + * (offset 0x00C, not 0x010) bit 0 is the change-pending bit. */ + { + volatile unsigned int* CCI_STAT = (volatile unsigned int*)0xFD6E000CUL; + volatile unsigned int* CCI_S3 = (volatile unsigned int*)0xFD6E4000UL; + volatile unsigned int* CCI_S4 = (volatile unsigned int*)0xFD6E5000UL; + *CCI_S3 = 0x00000003U; /* snoop enable | DVM enable */ + while ((*CCI_STAT & 0x1U) != 0U) { /* wait change complete */ } + *CCI_S4 = 0x00000003U; + while ((*CCI_STAT & 0x1U) != 0U) { } + wolfBoot_printf("CCI: snoop+DVM enabled on S3/S4\n"); + } +#endif + +#ifdef ZYNQMP_DDR_PROBE + zynqmp_ddr_probe(); +#endif + #ifndef WOLFBOOT_REPRODUCIBLE_BUILD wolfBoot_printf("Build: %s %s\n", __DATE__, __TIME__); #endif @@ -1653,6 +2235,16 @@ void hal_init(void) qspi_init(); #endif +#if defined(ZYNQMP_QSPI_DIAG) && ZYNQMP_QSPI_DIAG == 1 + qspi_ddr_diag(); +#endif + +#ifndef WOLFBOOT_ZYNQMP_FSBL + /* pmu_get_version()/csu_init() query the PMU firmware over an SMC to the + * EL3 SIP service provided by ARM Trusted Firmware.
When wolfBoot itself + * is the FSBL running at EL3 there is no ATF below it, so these are skipped + * (the QSPI tap-delay path in qspi_init() already falls back to direct + * MMIO when current_el() > 2). */ pmuVer = pmu_get_version(); wolfBoot_printf("PMUFW Ver: %d.%d\n", (int)(pmuVer >> 16), (int)(pmuVer & 0xFFFF)); @@ -1665,6 +2257,14 @@ void hal_init(void) wolfBoot_printf("Skipping CSU Init (PMUFW not found)\n"); } #endif +#endif /* !WOLFBOOT_ZYNQMP_FSBL */ + +#if defined(WOLFBOOT_ZYNQMP_FSBL) && defined(WOLFBOOT_ZYNQMP_PM_CFG) + /* As the FSBL, hand the PMU firmware its configuration object so it grants + * the APU access to the SoC power/clock/reset/peripheral nodes. Without + * this the downstream Linux drivers fail probe with -EACCES. */ + (void)zynqmp_pm_set_configuration(); +#endif } void hal_prepare_boot(void) @@ -1850,36 +2450,134 @@ int RAMFUNCTION ext_flash_write(uintptr_t address, const uint8_t *data, int len) int RAMFUNCTION ext_flash_read(uintptr_t address, uint8_t *data, int len) { - int ret; + int ret = 0; uint8_t cmd[8]; /* size multiple of uint32_t */ - uint32_t idx = 0; + uint32_t idx; + uintptr_t qaddr; + int off = 0; #if defined(DEBUG_ZYNQ) && DEBUG_ZYNQ >= 2 wolfBoot_printf("Flash Read: Addr 0x%x, Ptr %p, Len %d\n", address, data, len); #endif - if (mDev.stripe) { - /* For dual parallel the address divide by 2 */ - address /= 2; - } + /* Issue the read as a sequence of SEPARATE, individually-addressed + * transfers, each sized to a single generic-FIFO RX entry: a power of two + * <= 4KB (one EXP entry) or the exact remainder when <= 255 (one IMM + * entry). The ZynqMP GQSPI controller corrupts reads whose data spans more + * than one RX gen-FIFO entry in a single continuous (one command+address) + * transfer -- the streamed bytes desync and come back as zeros/garbage, + * regardless of DMA chunking or settle delays. 
Re-issuing the read command + * with a fresh (incremented) address per chunk keeps every transfer to one + * entry, which is the only form this part reads reliably. A single small + * read (header, flash ID) already used one entry and always worked; this + * extends that reliable form to large reads (firmware body load). */ + while (off < len) { + int rem = len - off; + int chunk; + +#ifdef ZYNQMP_QSPI_READ_DELAY + hal_delay_ms(1); /* TEST: settle before each transfer */ +#endif +#ifdef ZYNQMP_QSPI_NOCHUNK + /* TEST: one DMA for the whole read (the FSBL approach), so the DMA + * destination is armed ONCE and increments continuously, instead of + * re-arming per 4KB chunk (which wraps the dst at 16KB on this part). */ + chunk = rem; +#else + if (rem <= 0xFF) { + chunk = rem; /* one IMM gen-FIFO entry */ + } + else { + chunk = 0x1000; /* one EXP entry, max 4KB */ + while (chunk > rem) + chunk >>= 1; /* largest power of two <= rem */ + } +#endif +#if defined(GQSPI_MODE_IO) && defined(ZYNQMP_QSPI_PIO_CHUNK) + /* PIO mode drains the RX FIFO by CPU. Cap each transfer to <= the + * controller RX FIFO depth so a single gen-FIFO entry can never + * overflow it (no backpressure dependence). 128 bytes uses an IMM + * entry and is half the 256-byte RX FIFO. */ + if (chunk > ZYNQMP_QSPI_PIO_CHUNK) + chunk = ZYNQMP_QSPI_PIO_CHUNK; +#endif - /* ------ Read Flash ------ */ - memset(cmd, 0, sizeof(cmd)); - cmd[idx++] = FLASH_READ_CMD; + qaddr = address + (uintptr_t)off; + if (mDev.stripe) { + /* For dual parallel the per-chip address is half the combined. 
*/ + qaddr /= 2; + } + + idx = 0; + memset(cmd, 0, sizeof(cmd)); + cmd[idx++] = FLASH_READ_CMD; #if GQPI_USE_4BYTE_ADDR == 1 - cmd[idx++] = ((address >> 24) & 0xFF); + cmd[idx++] = ((qaddr >> 24) & 0xFF); +#endif + cmd[idx++] = ((qaddr >> 16) & 0xFF); + cmd[idx++] = ((qaddr >> 8) & 0xFF); + cmd[idx++] = ((qaddr >> 0) & 0xFF); +#ifdef ZYNQMP_QSPI_OCM_BOUNCE + /* DMA each chunk to a FIXED OCM scratch, then CPU-copy to the (DDR) + * destination. The GQSPI DMA writes to a non-incrementing OCM address + * reliably (it is how the BootROM loads wolfBoot); driving an + * incrementing DDR destination per chunk aliases/wraps. The CPU memcpy + * to DDR is naturally coherent with the CPU reads the verify does. */ + { + extern uint8_t XALIGNED(64) zynq_ocm_bounce[]; + ret = qspi_transfer(&mDev, cmd, idx, NULL, 0, zynq_ocm_bounce, + chunk, GQSPI_DUMMY_READ, mDev.mode); + if (ret == 0) { + memcpy(data + off, zynq_ocm_bounce, (size_t)chunk); + } +#if defined(DEBUG_ZYNQ) && defined(ZYNQMP_QSPI_BOUNCE_PROGRESS) + /* CAPTURE-ONLY (no printf in the loop -- a per-chunk printf + * disrupts the back-to-back QSPI timing and hangs the load). Store + * rd0/qa/dst at 0x2000 strides into arrays; dump them AFTER the + * whole load completes (below). rd0 = first word read into the + * bounce for THIS chunk (== src[off] if the read is right). 
*/ + if ((len > 0x18000) && (off < 0x18000) && ((off & 0x1FFFU) == 0)) { + unsigned k = (unsigned)off >> 13; + if (k < 12) { + bnc_qa[k] = (uint32_t)qaddr; + bnc_rd0[k] = *(volatile uint32_t*)zynq_ocm_bounce; + } + } #endif - cmd[idx++] = ((address >> 16) & 0xFF); - cmd[idx++] = ((address >> 8) & 0xFF); - cmd[idx++] = ((address >> 0) & 0xFF); - ret = qspi_transfer(&mDev, cmd, idx, NULL, 0, data, len, GQSPI_DUMMY_READ, - mDev.mode); + } +#else + ret = qspi_transfer(&mDev, cmd, idx, NULL, 0, data + off, chunk, + GQSPI_DUMMY_READ, mDev.mode); +#endif + if (ret != 0) { #if defined(DEBUG_ZYNQ) && DEBUG_ZYNQ >= 2 - wolfBoot_printf("Flash Read: Ret %d\r\n", ret); + wolfBoot_printf("Flash Read: Ret %d at off %d\r\n", ret, off); #endif + return ret; + } + off += chunk; + } - return (ret == 0) ? len : ret; +#if defined(DEBUG_ZYNQ) && DEBUG_ZYNQ >= 2 + wolfBoot_printf("Flash Read: Ret %d\r\n", ret); +#endif +#if defined(DEBUG_ZYNQ) && defined(ZYNQMP_QSPI_OCM_BOUNCE) && \ + defined(ZYNQMP_QSPI_BOUNCE_PROGRESS) + /* Post-load dump (no in-loop printf). rd0 = word READ for the chunk at + * off; dst = word now in DDR at off. If rd0 drifts from src[off] the READ + * is wrong (controller); if rd0 is right but dst is shifted the dest path + * is wrong. */ + if (len > 0x18000) { + unsigned k; + for (k = 0; k < 12; k++) { + wolfBoot_printf(" BNC off=%05x qa=%08x rd0=%08x dst=%08x\n", + k << 13, (unsigned)bnc_qa[k], (unsigned)bnc_rd0[k], + (unsigned)*(volatile uint32_t*)(data + (k << 13))); + } + } +#endif + return len; } /* Issues a sector erase based on flash address */ diff --git a/hal/zynq.h b/hal/zynq.h index d732f5f315..ed8809a304 100644 --- a/hal/zynq.h +++ b/hal/zynq.h @@ -204,6 +204,14 @@ #define GQSPI_FIFO_WORD_SZ 4 #define QQSPI_DMA_ALIGN 64 /* L1 cache size */ +/* Maximum bytes per GQSPI DMA transfer. 
The controller does not reliably + * complete a single very large (>=128KB) DMA read into DDR on ZynqMP -- the + * large chunk lands as zeros -- so cap each transfer and let the flash stream + * continuously across multiple RX gen-FIFO entries. 32KB is well within the + * working range and keeps the one-time boot read fast. */ +#ifndef GQSPI_DMA_MAX_CHUNK +#define GQSPI_DMA_MAX_CHUNK 0x8000U +#endif #ifndef GQSPI_DMA_TMPSZ /* Use larger of WOLFBOOT_SHA_BLOCK_SIZE or IMAGE_HEADER_SIZE */ #if defined(WOLFBOOT_SHA_BLOCK_SIZE) && \ diff --git a/hal/zynqmp/sleep.h b/hal/zynqmp/sleep.h new file mode 100644 index 0000000000..0346a1e92d --- /dev/null +++ b/hal/zynqmp/sleep.h @@ -0,0 +1,34 @@ +/* sleep.h + * + * Copyright (C) 2025 wolfSSL Inc. + * + * This file is part of wolfBoot. + * + * wolfBoot is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfBoot is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +/* Minimal Xilinx-compatible shim for the board-supplied + * psu_init_gpl.c. Implemented in hal/zynqmp_psu_shim.c using the ARMv8 + * generic timer (CNTPCT/CNTFRQ), matching how the Xilinx FSBL provides + * usleep()/sleep() during early init. 
*/ + +#ifndef WOLFBOOT_ZYNQMP_SLEEP_H +#define WOLFBOOT_ZYNQMP_SLEEP_H + +void usleep(unsigned long useconds); +unsigned int sleep(unsigned int seconds); + +#endif /* WOLFBOOT_ZYNQMP_SLEEP_H */ diff --git a/hal/zynqmp/xil_io.h b/hal/zynqmp/xil_io.h new file mode 100644 index 0000000000..c25dd02189 --- /dev/null +++ b/hal/zynqmp/xil_io.h @@ -0,0 +1,92 @@ +/* xil_io.h + * + * Copyright (C) 2025 wolfSSL Inc. + * + * This file is part of wolfBoot. + * + * wolfBoot is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfBoot is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +/* Minimal Xilinx-compatible MMIO/types shim. + * + * This provides just enough of the Xilinx BSP surface for an + * unmodified, board-generated psu_init_gpl.c to compile and link into + * wolfBoot when it runs as the ZynqMP FSBL replacement (WOLFBOOT_ZYNQMP_FSBL). + * The board's psu_init_gpl.c / psu_init_gpl.h are supplied at build time and + * are NOT part of the wolfBoot tree (they are board/XSA specific and carry the + * Xilinx copyright). See docs and config/examples/zynqmp_fsbl.config. 
+ */ + +#ifndef WOLFBOOT_ZYNQMP_XIL_IO_H +#define WOLFBOOT_ZYNQMP_XIL_IO_H + +#include <stdint.h> + +/* Xilinx fixed-width type aliases used by psu_init_gpl.c */ +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; +typedef uintptr_t UINTPTR; +typedef intptr_t INTPTR; + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +static inline u32 Xil_In32(UINTPTR addr) +{ + return *(volatile u32*)addr; +} + +static inline void Xil_Out32(UINTPTR addr, u32 value) +{ + *(volatile u32*)addr = value; +} + +static inline u16 Xil_In16(UINTPTR addr) +{ + return *(volatile u16*)addr; +} + +static inline void Xil_Out16(UINTPTR addr, u16 value) +{ + *(volatile u16*)addr = value; +} + +static inline u8 Xil_In8(UINTPTR addr) +{ + return *(volatile u8*)addr; +} + +/* Xilinx BSP logging used by a few psu_init error paths (e.g. SERDES cal + * timeout). psu_init runs before uart_init(), so this is a no-op; see + * hal/zynqmp_psu_shim.c. */ +int xil_printf(const char* ctrl1, ...); + +static inline void Xil_Out8(UINTPTR addr, u8 value) +{ + *(volatile u8*)addr = value; +} + +#endif /* WOLFBOOT_ZYNQMP_XIL_IO_H */ diff --git a/hal/zynqmp_atf.c b/hal/zynqmp_atf.c new file mode 100644 index 0000000000..a000964933 --- /dev/null +++ b/hal/zynqmp_atf.c @@ -0,0 +1,109 @@ +/* zynqmp_atf.c + * + * Copyright (C) 2025 wolfSSL Inc. + * + * This file is part of wolfBoot. + * + * wolfBoot is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfBoot is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +/* ARM Trusted Firmware (BL31) handoff for wolfBoot as the ZynqMP FSBL. + * + * The handoff-parameter format and the register used to publish it match the + * Xilinx FSBL (see the embeddedsw zynqmp_fsbl xfsbl_image_header.{c,h}): + * - A parameter block beginning with the ASCII magic "XLNX", an entry count, + * and up to 8 entries, each {u64 entry_point, u64 flags}. + * - The block's address is written to PMU_GLOBAL.GLOBAL_GEN_STORAGE6; BL31 + * reads it during early setup. + * + * For Milestone 1 wolfBoot publishes a single entry describing the next + * normal-world image (BL33). BL31 starts BL33 at the requested exception + * level; passing a device tree / arguments to BL33 is BL31's responsibility + * (the handoff block carries no argument fields). */ + +#include "zynqmp_atf.h" + +/* PMU_GLOBAL base 0xFFD80000. GLOBAL_GEN_STORAGE6 (offset 0x48) carries the + * ATF handoff-parameter block address (read by stock BL31). GLOBAL_GEN_STORAGE5 + * (offset 0x44) carries the BL33 device tree address for a direct-Linux BL33 + * (consumed by a small TF-A patch; see header). Both are device memory in + * wolfBoot's MMU, so the stores reach the registers without a cache flush. */ +#define PMU_GLOBAL_GLOB_GEN_STORAGE5 0xFFD80044UL +#define PMU_GLOBAL_GLOB_GEN_STORAGE6 0xFFD80048UL + +/* PartitionFlags bit layout in the ATF handoff parameters. 
*/ +#define ZYNQMP_ATF_FLAG_EXEC_AA32 (1U << 0) /* 0 = AArch64 */ +#define ZYNQMP_ATF_FLAG_BIG_ENDIAN (1U << 1) /* 0 = little-endian */ +#define ZYNQMP_ATF_FLAG_SECURE (1U << 2) /* 0 = non-secure (normal) */ +#define ZYNQMP_ATF_FLAG_EL_SHIFT 3U /* bits [4:3] = target EL */ +#define ZYNQMP_ATF_FLAG_EL_MASK (3U << ZYNQMP_ATF_FLAG_EL_SHIFT) +#define ZYNQMP_ATF_FLAG_CPU_SHIFT 5U /* bits [6:5] = A53 core (0) */ + +#define ZYNQMP_ATF_MAX_ENTRIES 8 + +struct zynqmp_atf_entry { + uint64_t entry_point; + uint64_t flags; +}; + +struct zynqmp_atf_handoff_params { + char magic[4]; /* "XLNX" */ + uint32_t num_entries; + struct zynqmp_atf_entry entry[ZYNQMP_ATF_MAX_ENTRIES]; +}; + +/* Lives in OCM (wolfBoot's .bss). BL31 is loaded to DDR, so this block is + * untouched until BL31 reads it. */ +static struct zynqmp_atf_handoff_params atf_handoff; + +/* Defined in src/boot_aarch64_start.S (built under WOLFBOOT_ZYNQMP_FSBL). */ +extern void el3_to_atf_boot(uintptr_t bl31_entry); +extern void flush_dcache_range(uintptr_t start, uintptr_t end); + +void zynqmp_atf_handoff(uintptr_t bl31_entry, uintptr_t bl33_entry, + uintptr_t dts_addr, uint32_t bl33_el) +{ + volatile uint32_t* storage5 = + (volatile uint32_t*)PMU_GLOBAL_GLOB_GEN_STORAGE5; + volatile uint32_t* storage6 = + (volatile uint32_t*)PMU_GLOBAL_GLOB_GEN_STORAGE6; + uint64_t flags; + + /* Non-secure, AArch64, little-endian, A53-0, at the requested EL. */ + flags = ((uint64_t)(bl33_el << ZYNQMP_ATF_FLAG_EL_SHIFT)) + & ZYNQMP_ATF_FLAG_EL_MASK; + + atf_handoff.magic[0] = 'X'; + atf_handoff.magic[1] = 'L'; + atf_handoff.magic[2] = 'N'; + atf_handoff.magic[3] = 'X'; + atf_handoff.num_entries = 1; + atf_handoff.entry[0].entry_point = (uint64_t)bl33_entry; + atf_handoff.entry[0].flags = flags; + + /* Publish the parameter-block address where BL31 reads it, and the BL33 + * device tree address for the TF-A patch that forwards it to x0. 
*/ + *storage6 = (uint32_t)(uintptr_t)&atf_handoff; + *storage5 = (uint32_t)dts_addr; + + /* Clean the parameter block to the PoC so BL31, entered with the MMU and + * caches off, observes it. */ + flush_dcache_range((uintptr_t)&atf_handoff, + (uintptr_t)&atf_handoff + sizeof(atf_handoff)); + + /* Tear down EL3 MMU/caches and branch to BL31. Does not return. */ + el3_to_atf_boot(bl31_entry); +} diff --git a/hal/zynqmp_atf.h b/hal/zynqmp_atf.h new file mode 100644 index 0000000000..d129e76771 --- /dev/null +++ b/hal/zynqmp_atf.h @@ -0,0 +1,55 @@ +/* zynqmp_atf.h + * + * Copyright (C) 2025 wolfSSL Inc. + * + * This file is part of wolfBoot. + * + * wolfBoot is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfBoot is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +/* ARM Trusted Firmware (BL31) handoff for wolfBoot running as the ZynqMP FSBL + * replacement (Milestone 1). wolfBoot loads + verifies BL31 and the next + * normal-world image (BL33: U-Boot or the Linux kernel) with its own keys, + * then hands off to BL31 at EL3. BL31 stays resident as the EL3 monitor and + * drops BL33 to the requested lower exception level. */ + +#ifndef ZYNQMP_ATF_H +#define ZYNQMP_ATF_H + +#include + +/* Target exception level for the BL33 (normal-world) image. 
*/
+#define ZYNQMP_ATF_EL1 1U
+#define ZYNQMP_ATF_EL2 2U
+
+/* Build the BL31 handoff parameters for the given normal-world (BL33) entry
+ * point and exception level, publish the parameter-block address to the
+ * PMU_GLOBAL scratch register that BL31 reads, and hand off to BL31 at EL3.
+ * Does not return.
+ *
+ * bl31_entry: EL3 entry point of the loaded BL31 image (from its ELF entry).
+ * bl33_entry: entry point of the next normal-world image BL31 will start.
+ * dts_addr:   device tree address for BL33. The standard ATF handoff block has
+ *             no argument fields and stock ZynqMP TF-A enters BL33 with x0=0,
+ *             so for a direct (no U-Boot) Linux BL33 wolfBoot also publishes
+ *             dts_addr in PMU_GLOBAL.GLOBAL_GEN_STORAGE5. A small TF-A patch
+ *             must read that register into the BL33 entrypoint x0; pass 0 when
+ *             BL33 finds its own DTB (e.g. U-Boot). See docs.
+ * bl33_el:    ZYNQMP_ATF_EL1 or ZYNQMP_ATF_EL2. */
+void zynqmp_atf_handoff(uintptr_t bl31_entry, uintptr_t bl33_entry,
+    uintptr_t dts_addr, uint32_t bl33_el);
+
+#endif /* ZYNQMP_ATF_H */
diff --git a/hal/zynqmp_fsbl.its b/hal/zynqmp_fsbl.its
new file mode 100644
index 0000000000..395061b17e
--- /dev/null
+++ b/hal/zynqmp_fsbl.its
@@ -0,0 +1,77 @@
+/dts-v1/;
+
+/*
+ * Xilinx ZynqMP ZCU102 - wolfBoot-as-FSBL -> Linux boot FIT.
+ *
+ * wolfBoot (running as the FSBL replacement at EL3 in OCM) loads this FIT
+ * from QSPI, verifies the outer wolfBoot RSA4096/SHA3 signature, then:
+ *   - gunzips the kernel to 0x00200000 (BL33 entry),
+ *   - loads BL31 to 0x70000000 (found by the literal node name "atf"),
+ *   - relocates the DTB to WOLFBOOT_LOAD_DTS_ADDRESS (0x11800000),
+ *   - publishes DTB addr in PMU_GLOBAL_GEN_STORAGE5 and the XLNX handoff
+ *     block addr in GEN_STORAGE6, and branches to BL31 at EL3.
+ * BL31 (xlnx_rebase_v2.12 + tf-a-zynqmp-wolfboot-dtb.patch) hands the kernel
+ * its DTB in x0 and enters BL33 at EL2.
+ *
+ * The incbin paths are absolute (artifacts live in the PetaLinux images tree,
+ * outside this repo). Regenerate bl31-ddr-0x70000000.bin per the plan if the
+ * BL31 link base changes; it must equal the "atf" load/entry below.
+ *
+ * Build: mkimage -f hal/zynqmp_fsbl.its fitImage
+ * Sign:  ./tools/keytools/sign --rsa4096 --sha3 fitImage <private-key>.der 1
+ */
+
+/ {
+    description = "Xilinx ZynqMP ZCU102 - wolfBoot Linux (FSBL)";
+    #address-cells = <1>;
+
+    images {
+        kernel-1 {
+            description = "Linux Kernel (gzip)";
+            data = /incbin/("/home/davidgarske/Projects/Gilat/xilinx-zcu102-spi-2025.2/images/linux/Image.gz");
+            type = "kernel";
+            arch = "arm64";
+            os = "linux";
+            compression = "gzip";
+            load = <0x00200000>;
+            entry = <0x00200000>;
+            hash-1 {
+                algo = "sha256";
+            };
+        };
+        atf {
+            description = "ARM Trusted Firmware BL31 (DDR-linked)";
+            data = /incbin/("/home/davidgarske/Projects/Gilat/xilinx-zcu102-spi-2025.2/images/linux/bl31-ddr-0x70000000.bin");
+            type = "firmware";
+            arch = "arm64";
+            os = "arm-trusted-firmware";
+            compression = "none";
+            load = <0x70000000>;
+            entry = <0x70000000>;
+            hash-1 {
+                algo = "sha256";
+            };
+        };
+        fdt-1 {
+            description = "Flattened Device Tree blob";
+            data = /incbin/("/home/davidgarske/Projects/Gilat/xilinx-zcu102-spi-2025.2/images/linux/system.dtb");
+            type = "flat_dt";
+            arch = "arm64";
+            compression = "none";
+            hash-1 {
+                algo = "sha256";
+            };
+        };
+    };
+    configurations {
+        default = "conf-1";
+        conf-1 {
+            description = "Linux kernel + FDT (BL31 loaded by literal name atf)";
+            kernel = "kernel-1";
+            fdt = "fdt-1";
+            hash-1 {
+                algo = "sha256";
+            };
+        };
+    };
+};
diff --git a/hal/zynqmp_ocm.ld b/hal/zynqmp_ocm.ld
new file mode 100644
index 0000000000..d88bff678b
--- /dev/null
+++ b/hal/zynqmp_ocm.ld
@@ -0,0 +1,336 @@
+/* Linker Script for Zynq UltraScale+ (ZynqMP) - wolfBoot as FSBL replacement.
+ *
+ * In this configuration the BootROM authenticates and loads wolfBoot (the
+ * [bootloader] partition) directly into the 256KB On-Chip Memory (OCM) at
+ * 0xFFFC0000 and enters it at EL3. wolfBoot then runs psu_init() to bring up
+ * the PLLs/DDR/MIO/clocks BEFORE any DDR access, so wolfBoot itself must link
+ * and run entirely from OCM (DDR is not available at entry).
+ *
+ * Budget (256KB OCM): wolfBoot is ~71KB (text+data+real-bss); the EL3 stack
+ * and heap reservations plus the board psu_init .text share the remainder.
+ * Heap is intentionally small here (SPMATH avoids large heap use); raise
+ * _HEAP_SIZE only if a config needs it. Once psu_init has brought up DDR, the
+ * downstream image staging/verify buffers live in DDR (WOLFBOOT_LOAD_ADDRESS),
+ * not in OCM.
+ */
+
+/* EL3 stack 64KB for RSA 4096-bit verify; heap kept small to fit OCM. */
+_STACK_SIZE = DEFINED(_STACK_SIZE) ? _STACK_SIZE : 0x10000;
+_HEAP_SIZE = DEFINED(_HEAP_SIZE) ? _HEAP_SIZE : 0x8000; /* 32KB default */
+
+_EL0_STACK_SIZE = DEFINED(_EL0_STACK_SIZE) ? _EL0_STACK_SIZE : 1024;
+_EL1_STACK_SIZE = DEFINED(_EL1_STACK_SIZE) ? _EL1_STACK_SIZE : 2048;
+_EL2_STACK_SIZE = DEFINED(_EL2_STACK_SIZE) ? _EL2_STACK_SIZE : 1024;
+
+/* Define Memories in the system */
+MEMORY
+{
+    /* wolfBoot links/runs from OCM (256KB) as the FSBL. The BootROM places it
+     * here; DDR is brought up later by psu_init() and is only used at runtime
+     * for downstream image staging (not for wolfBoot's own sections). */
+    psu_ocm_ram_0_MEM_0 : ORIGIN = 0xFFFC0000, LENGTH = 0x40000
+    psu_ddr_0_MEM_0 : ORIGIN = 0x0, LENGTH = 0x80000000
+    psu_ddr_1_MEM_0 : ORIGIN = 0x800000000, LENGTH = 0x80000000
+    psu_qspi_linear_0_MEM_0 : ORIGIN = 0xC0000000, LENGTH = 0x20000000
+}
+
+/* Specify the default entry point to the program */
+ENTRY(_vector_table)
+
+/* Define the sections, and where they are mapped in memory */
+SECTIONS
+{
+
+    PROVIDE (_DDR_ADDRESS = 0x80001000);
+    PROVIDE (_OCRAM_ADDRESS = ORIGIN(psu_ocm_ram_0_MEM_0));
+    PROVIDE (_MEMORY_SIZE = LENGTH(psu_ocm_ram_0_MEM_0));
+
+.text : {
+    KEEP (*(.vectors))
+    *(.boot)
+    *(.text)
+    *(.text.*)
+    *(.gnu.linkonce.t.*)
+    *(.plt)
+    *(.gnu_warning)
+    *(.gcc_except_table)
+    *(.glue_7)
+    *(.glue_7t)
+    *(.ARM.extab)
+    *(.gnu.linkonce.armextab.*)
+} > psu_ocm_ram_0_MEM_0
+
+.init (ALIGN(64)) : {
+    KEEP (*(.init))
+} > psu_ocm_ram_0_MEM_0
+
+.fini (ALIGN(64)) : {
+    KEEP (*(.fini))
+} > psu_ocm_ram_0_MEM_0
+
+.interp : {
+    KEEP (*(.interp))
+} > psu_ocm_ram_0_MEM_0
+
+.note-ABI-tag : {
+    KEEP (*(.note-ABI-tag))
+} > psu_ocm_ram_0_MEM_0
+
+.rodata : {
+    . = ALIGN(64);
+    __rodata_start = .;
+    *(.rodata)
+    *(.rodata.*)
+    *(.gnu.linkonce.r.*)
+    __rodata_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.rodata1 : {
+    . = ALIGN(64);
+    __rodata1_start = .;
+    *(.rodata1)
+    *(.rodata1.*)
+    __rodata1_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.sdata2 : {
+    . = ALIGN(64);
+    __sdata2_start = .;
+    *(.sdata2)
+    *(.sdata2.*)
+    *(.gnu.linkonce.s2.*)
+    __sdata2_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.sbss2 : {
+    . = ALIGN(64);
+    __sbss2_start = .;
+    *(.sbss2)
+    *(.sbss2.*)
+    *(.gnu.linkonce.sb2.*)
+    __sbss2_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.data : {
+    . = ALIGN(64);
+    __data_start = .;
+    *(.data)
+    *(.data.*)
+    *(.gnu.linkonce.d.*)
+    *(.jcr)
+    *(.got)
+    *(.got.plt)
+    __data_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.data1 : {
+    . = ALIGN(64);
+    __data1_start = .;
+    *(.data1)
+    *(.data1.*)
+    __data1_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.got : {
+    *(.got)
+} > psu_ocm_ram_0_MEM_0
+
+.got1 : {
+    *(.got1)
+} > psu_ocm_ram_0_MEM_0
+
+.got2 : {
+    *(.got2)
+} > psu_ocm_ram_0_MEM_0
+
+.note.gnu.build-id : {
+    KEEP (*(.note.gnu.build-id))
+} > psu_ocm_ram_0_MEM_0
+
+.ctors : {
+    . = ALIGN(64);
+    __CTOR_LIST__ = .;
+    ___CTORS_LIST___ = .;
+    KEEP (*crtbegin.o(.ctors))
+    KEEP (*(EXCLUDE_FILE(*crtend.o) .ctors))
+    KEEP (*(SORT(.ctors.*)))
+    KEEP (*(.ctors))
+    __CTOR_END__ = .;
+    ___CTORS_END___ = .;
+} > psu_ocm_ram_0_MEM_0
+
+.dtors : {
+    . = ALIGN(64);
+    __DTOR_LIST__ = .;
+    ___DTORS_LIST___ = .;
+    KEEP (*crtbegin.o(.dtors))
+    KEEP (*(EXCLUDE_FILE(*crtend.o) .dtors))
+    KEEP (*(SORT(.dtors.*)))
+    KEEP (*(.dtors))
+    __DTOR_END__ = .;
+    ___DTORS_END___ = .;
+} > psu_ocm_ram_0_MEM_0
+
+.fixup : {
+    __fixup_start = .;
+    *(.fixup)
+    __fixup_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.eh_frame : {
+    *(.eh_frame)
+} > psu_ocm_ram_0_MEM_0
+
+.eh_framehdr : {
+    __eh_framehdr_start = .;
+    *(.eh_framehdr)
+    __eh_framehdr_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.gcc_except_table : {
+    *(.gcc_except_table)
+} > psu_ocm_ram_0_MEM_0
+
+.mmu_tbl0 (ALIGN(4096)) : {
+    __mmu_tbl0_start = .;
+    *(.mmu_tbl0)
+    __mmu_tbl0_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.mmu_tbl1 (ALIGN(4096)) : {
+    __mmu_tbl1_start = .;
+    *(.mmu_tbl1)
+    __mmu_tbl1_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.mmu_tbl2 (ALIGN(4096)) : {
+    __mmu_tbl2_start = .;
+    *(.mmu_tbl2)
+    __mmu_tbl2_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.ARM.exidx : {
+    __exidx_start = .;
+    *(.ARM.exidx*)
+    *(.gnu.linkonce.armexidx.*.*)
+    __exidx_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.preinit_array : {
+    . = ALIGN(64);
+    __preinit_array_start = .;
+    KEEP (*(SORT(.preinit_array.*)))
+    KEEP (*(.preinit_array))
+    __preinit_array_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.init_array : {
+    . = ALIGN(64);
+    __init_array_start = .;
+    KEEP (*(SORT(.init_array.*)))
+    KEEP (*(.init_array))
+    __init_array_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.fini_array : {
+    . = ALIGN(64);
+    __fini_array_start = .;
+    KEEP (*(SORT(.fini_array.*)))
+    KEEP (*(.fini_array))
+    __fini_array_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.ARM.attributes : {
+    __ARM.attributes_start = .;
+    *(.ARM.attributes)
+    __ARM.attributes_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.sdata : {
+    . = ALIGN(64);
+    __sdata_start = .;
+    *(.sdata)
+    *(.sdata.*)
+    *(.gnu.linkonce.s.*)
+    __sdata_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.tdata : {
+    . = ALIGN(64);
+    __tdata_start = .;
+    *(.tdata)
+    *(.tdata.*)
+    *(.gnu.linkonce.td.*)
+    __tdata_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.tbss : {
+    . = ALIGN(64);
+    __tbss_start = .;
+    *(.tbss)
+    *(.tbss.*)
+    *(.gnu.linkonce.tb.*)
+    __tbss_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+
+.sbss (NOLOAD) : {
+    . = ALIGN(64);
+    __sbss_start = .;
+    *(.sbss)
+    *(.sbss.*)
+    *(.gnu.linkonce.sb.*)
+    . = ALIGN(64);
+    __sbss_end = .;
+} > psu_ocm_ram_0_MEM_0
+
+.bss (NOLOAD) : {
+    . = ALIGN(64);
+    __bss_start__ = .;
+    *(.bss)
+    *(.bss.*)
+    *(.gnu.linkonce.b.*)
+    *(COMMON)
+    . = ALIGN(64);
+    __bss_end__ = .;
+} > psu_ocm_ram_0_MEM_0
+
+_SDA_BASE_ = __sdata_start + ((__sbss_end - __sdata_start) / 2 );
+
+_SDA2_BASE_ = __sdata2_start + ((__sbss2_end - __sdata2_start) / 2 );
+
+/* Generate Stack and Heap definitions */
+
+.heap (NOLOAD) : {
+    . = ALIGN(64);
+    _heap = .;
+    HeapBase = .;
+    _heap_start = .;
+    . += _HEAP_SIZE;
+    _heap_end = .;
+    HeapLimit = .;
+} > psu_ocm_ram_0_MEM_0
+
+.stack (NOLOAD) : {
+    . = ALIGN(64);
+    _el3_stack_end = .; /* lowest address of the EL3 stack region */
+    . += _STACK_SIZE;
+    __el3_stack = .; /* highest address of the EL3 stack region */
+    _el2_stack_end = .; /* EL2/EL1/EL0 regions follow the same pattern */
+    . += _EL2_STACK_SIZE;
+    . = ALIGN(64);
+    __el2_stack = .;
+    _el1_stack_end = .;
+    . += _EL1_STACK_SIZE;
+    . = ALIGN(64);
+    __el1_stack = .;
+    _el0_stack_end = .;
+    . += _EL0_STACK_SIZE;
+    . = ALIGN(64);
+    __el0_stack = .;
+} > psu_ocm_ram_0_MEM_0
+
+PROVIDE(_stack_base = .);
+
+_end = .;
+}
diff --git a/hal/zynqmp_psu_shim.c b/hal/zynqmp_psu_shim.c
new file mode 100644
index 0000000000..7da7f6f82c
--- /dev/null
+++ b/hal/zynqmp_psu_shim.c
@@ -0,0 +1,282 @@
+/* zynqmp_psu_shim.c
+ *
+ * Copyright (C) 2025 wolfSSL Inc.
+ *
+ * This file is part of wolfBoot.
+ *
+ * wolfBoot is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfBoot is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+/* Delay primitives backing the shim used by the board-supplied
+ * psu_init_gpl.c when wolfBoot runs as the ZynqMP FSBL replacement.
+ *
+ * Delays count CNTPCT_EL0 ticks against the fixed ZYNQMP_TIMESTAMP_HZ rate.
+ * In the FSBL build the EL3 startup (boot_aarch64_start.S) does not program
+ * CNTFRQ_EL0; usleep() enables the IOU_SCNTRS system counter itself before
+ * timing, and zynqmp_psu_init() sets CNTFRQ_EL0 to ZYNQMP_TIMESTAMP_HZ. */
+
+#include "sleep.h"
+#include "xil_io.h"
+
+/* Xilinx BSP logging shim. The board psu_init_gpl.c calls xil_printf() on a few
+ * error paths; psu_init runs before the UART is initialized, so this is a
+ * no-op. Route to wolfBoot_printf() once the UART is up if you need the
+ * diagnostics. */
+int xil_printf(const char* ctrl1, ...)
+{
+    (void)ctrl1;
+    return 0;
+}
+
+/* ZynqMP IOU system timestamp counter (drives CNTPCT_EL0) and its fixed
+ * timestamp clock frequency (XPAR_CPU_CORTEXA53_0_TIMESTAMP_CLK_FREQ = 100MHz,
+ * set by psu_clock_init_data's TIMESTAMP_REF_CTRL). */
+#define IOU_SCNTRS_CNT_CONTROL 0xFF260000UL
+#define IOU_SCNTRS_FREQ_REG 0xFF260020UL
+#define IOU_SCNTRS_EN 0x00000001UL
+#ifndef ZYNQMP_TIMESTAMP_HZ
+#define ZYNQMP_TIMESTAMP_HZ 100000000UL
+#endif
+
+/* Equivalent of the Xilinx BSP XTime_StartTimer(): enable the IOU_SCNTRS
+ * system counter (so CNTPCT_EL0 increments) if it is not already enabled. The
+ * BootROM/early psu_init leaves it DISABLED -- the generated psu_init only
+ * enables it in psu_peripherals_init_data, which runs AFTER DDR training --
+ * yet psu_ddr_phybringup_data needs working usleep delays. The Xilinx BSP
+ * usleep calls this on every delay for exactly this reason. */
+static void zynqmp_start_timer(void)
+{
+    if ((Xil_In32(IOU_SCNTRS_CNT_CONTROL) & IOU_SCNTRS_EN) != IOU_SCNTRS_EN) {
+        Xil_Out32(IOU_SCNTRS_FREQ_REG, ZYNQMP_TIMESTAMP_HZ);
+        Xil_Out32(IOU_SCNTRS_CNT_CONTROL, IOU_SCNTRS_EN);
+    }
+}
+
+static unsigned long psu_timer_count(void) /* raw CNTPCT_EL0 value */
+{
+    unsigned long cntpct;
+    asm volatile("mrs %0, cntpct_el0" : "=r"(cntpct));
+    return cntpct;
+}
+
+void usleep(unsigned long useconds) /* busy-wait for useconds microseconds */
+{
+    unsigned long start;
+    unsigned long ticks;
+    unsigned long freq = ZYNQMP_TIMESTAMP_HZ;
+
+#ifdef WOLFBOOT_ZYNQMP_FSBL
+    /* Match the Xilinx BSP: ensure the system counter is running before use. */
+    zynqmp_start_timer();
+#endif
+#ifdef ZYNQMP_USLEEP_SCALE
+    useconds *= (unsigned long)(ZYNQMP_USLEEP_SCALE);
+#endif
+    start = psu_timer_count();
+    /* ticks = useconds * freq / 1e6, split to avoid 64-bit overflow for the
+     * small delays psu_init uses. */
+    ticks = (useconds / 1000000UL) * freq
+        + ((useconds % 1000000UL) * freq) / 1000000UL;
+
+    while ((psu_timer_count() - start) < ticks) {
+        /* busy-wait */
+    }
+}
+
+unsigned int sleep(unsigned int seconds) /* always returns 0 */
+{
+    unsigned int i;
+    for (i = 0; i < seconds; i++) {
+        usleep(1000000UL);
+    }
+    return 0;
+}
+
+#ifdef WOLFBOOT_ZYNQMP_FSBL
+
+/* Board psu_init sub-stages (non-static in the XSA-generated psu_init_gpl.c).
+ * Declared here so we do not pull in the huge board psu_init_gpl.h. */
+extern unsigned long psu_mio_init_data(void);
+extern unsigned long psu_peripherals_pre_init_data(void);
+extern unsigned long psu_pll_init_data(void);
+extern unsigned long psu_clock_init_data(void);
+extern unsigned long psu_ddr_init_data(void);
+extern unsigned long psu_ddr_phybringup_data(void);
+extern unsigned long psu_peripherals_init_data(void);
+extern unsigned long psu_resetin_init_data(void);
+extern unsigned long psu_serdes_init_data(void);
+extern unsigned long psu_resetout_init_data(void);
+/* serdes signal-integrity calibration. serdes_fixcal_code() is non-static in
+ * the board file; serdes_enb_coarse_saturation() is static (inlined below). */
+extern int serdes_fixcal_code(void);
+extern unsigned long psu_peripherals_powerdwn_data(void);
+extern unsigned long psu_afi_config(void);
+extern unsigned long psu_ddr_qos_init_data(void);
+
+/* NOTE(review): ZYNQMP_EARLY_CNTPCT_HZ is not referenced anywhere in this
+ * file: usleep() and zynqmp_fix_timer_freq() both use ZYNQMP_TIMESTAMP_HZ
+ * (100 MHz), with ZYNQMP_USLEEP_SCALE as the only optional stretch factor.
+ * It records an earlier bring-up finding: the BootROM was observed to leave
+ * the IOU_SCNTRS counter at the un-divided ~1.5 GHz rate (15 x 100 MHz), so
+ * an unscaled usleep delivered ~1/15 of the requested time and a 16x scale
+ * restored correct DDR training; 1.6 GHz was chosen as a slightly
+ * conservative value. That contradicts the zynqmp_start_timer() comment
+ * (counter left DISABLED by the BootROM) -- confirm which holds on a cold
+ * boot, then either use this value or delete it. */
+#define ZYNQMP_EARLY_CNTPCT_HZ 1600000000UL
+
+/* Diagnostic capture of the early (BootROM) timer/clock state, printed by
+ * hal_init() once the UART is up. Helps determine the true system-counter
+ * rate from a real cold boot (not a JTAG load). */
+unsigned long zynqmp_dbg_cntfrq_boot = 0; /* CNTFRQ_EL0 left by the BootROM */
+unsigned long zynqmp_dbg_cntfrq_used = 0; /* value usleep() ends up using */
+unsigned int zynqmp_dbg_ts_ctrl = 0; /* CRL TIMESTAMP_REF_CTRL 0xFF5E0128 */
+unsigned int zynqmp_dbg_iopll_ctrl = 0; /* CRL IOPLL_CTRL 0xFF5E0020 */
+unsigned int zynqmp_dbg_scntr_ctrl = 0; /* IOU_SCNTRS COUNTER_CONTROL */
+unsigned int zynqmp_dbg_scntr_freq = 0; /* IOU_SCNTRS BASE_FREQUENCY */
+volatile unsigned long zynqmp_dbg_cntpct_live = 0; /* live CNTPCT (measure spin) */
+
+static void zynqmp_fix_timer_freq(void)
+{
+    unsigned long f = 0;
+
+    /* Capture the BootROM timer/clock state BEFORE psu_init reprograms it.
+     * Only CRL_APB registers (always clocked) are safe to read this early;
+     * the IOU_SCNTRS block (0xFF260000) is NOT accessible until psu_init
+     * clocks it and reading it here bus-stalls the core. */
+    __asm__ volatile("mrs %0, cntfrq_el0" : "=r"(f));
+    zynqmp_dbg_cntfrq_boot = f;
+    zynqmp_dbg_ts_ctrl = Xil_In32(0xFF5E0128UL);
+    zynqmp_dbg_iopll_ctrl = Xil_In32(0xFF5E0020UL);
+
+    /* The IOU_SCNTRS system counter runs at the fixed 100MHz timestamp clock
+     * (set by psu_clock_init_data) once enabled. usleep() enables it and uses
+     * 100MHz directly; set CNTFRQ_EL0 to match so hal_get_timer_us()/
+     * hal_timer_ms() are also correct. */
+    f = ZYNQMP_TIMESTAMP_HZ;
+    __asm__ volatile("msr cntfrq_el0, %0\n\t isb" : : "r"(f));
+    zynqmp_dbg_cntfrq_used = f;
+}
+
+/* Replacement for the board file's psu_init(). Mirrors its call sequence (see
+ * psu_init_gpl.c) but calls zynqmp_fix_timer_freq() first, before any
+ * sub-stage, so CNTFRQ_EL0 is programmed ahead of DDR init/training. The
+ * static init_serdes()/init_peripheral() in the board file are inlined here
+ * via their non-static sub-stages. The serdes stage, including the
+ * serdes_fixcal_code()/serdes_enb_coarse_saturation() calibration, only runs
+ * when ZYNQMP_PSU_INIT_SERDES is defined; it is not needed for QSPI/SD/RGMII
+ * boot (no PS-GTR lanes). Returns 0 on success, 1 if a checked stage failed. */
+/* JTAG-readable psu_init progress marker. UART is not up during early
+ * psu_init, so write a stage number to a PMU_GLOBAL general-storage register
+ * (0xFFD80030, GEN_STORAGE0) before each sub-stage. After a cold-boot hang,
+ * read 0xFFD80030 over JTAG to see the last sub-stage that STARTED. */
+#define ZYNQMP_PSU_STAGE_REG 0xFFD80030UL
+#define PSU_STAGE(n) Xil_Out32(ZYNQMP_PSU_STAGE_REG, (u32)(n))
+
+int zynqmp_psu_init(void)
+{
+    int status = 1;
+    u32 smmu;
+
+    PSU_STAGE(0xA000);
+    /* Snapshot the BootROM timer/clock state for diagnostics and program
+     * CNTFRQ_EL0 = ZYNQMP_TIMESTAMP_HZ before any sub-stage runs. usleep()
+     * does not read CNTFRQ_EL0 (it uses ZYNQMP_TIMESTAMP_HZ directly), so
+     * this affects CNTFRQ_EL0 consumers only, not the psu_init settle
+     * delays themselves. */
+    zynqmp_fix_timer_freq();
+
+    PSU_STAGE(0xA001); status &= psu_mio_init_data();
+    PSU_STAGE(0xA002); status &= psu_peripherals_pre_init_data();
+    PSU_STAGE(0xA003); status &= psu_pll_init_data();
+    PSU_STAGE(0xA004); status &= psu_clock_init_data();
+
+    PSU_STAGE(0xA005); status &= psu_ddr_init_data();
+    PSU_STAGE(0xA006); status &= psu_ddr_phybringup_data();
+    PSU_STAGE(0xA007);
+
+#ifdef ZYNQMP_MEASURE_SPIN
+    /* Freeze here (right after DDR training, before peripherals_init changes
+     * the timestamp divisor) and continuously publish live CNTPCT_EL0 into an
+     * OCM global. The counter is enabled by now (DDR usleeps ran). JTAG reads
+     * the global twice across a host-timed window to measure the actual
+     * training-window counter rate. */
+    {
+        volatile unsigned long c;
+        zynqmp_start_timer();
+        for (;;) {
+            __asm__ volatile("mrs %0, cntpct_el0" : "=r"(c));
+            zynqmp_dbg_cntpct_live = c;
+        }
+    }
+#endif
+
+    PSU_STAGE(0xA008); status &= psu_peripherals_init_data();
+
+    /* init_serdes() equivalent. The serdes/PS-GTR block (USB/SATA/PCIe/DP
+     * lane reset + serdes_illcalib + SERDES_Ln PLL-lock mask_poll in
+     * psu_resetout) is NOT required to boot from QSPI/SD into DDR and hand off
+     * to Linux -- none of those high-speed peripherals are in the boot path,
+     * and Linux re-initializes the PS-GTR via its own phy driver. It is skipped
+     * by default; define ZYNQMP_PSU_INIT_SERDES to run it (needed if the kernel
+     * uses a PS-GTR peripheral -- e.g. the USB3 dwc3 controller, whose probe
+     * otherwise hangs waiting on an unclocked PHY).
+     *
+     * This replicates the board init_serdes() FAITHFULLY. The earlier bare
+     * resetin -> serdes -> resetout hung because it omitted the two serdes
+     * calibration helpers the Xilinx FSBL runs first: without them the SERDES
+     * PLLs never lock and the resetout PLL-lock mask_polls each burn the full
+     * PSU_MASK_POLL_TIME per lane. serdes_fixcal_code() is non-static so we call
+     * it; serdes_enb_coarse_saturation() is static in the XSA-generated board
+     * file, so its body (four "enable PLL coarse-code saturation" writes) is
+     * inlined here rather than modifying psu_init_gpl.c. serdes_illcalib() is
+     * already called inside psu_serdes_init_data(). */
+#ifdef ZYNQMP_PSU_INIT_SERDES
+    PSU_STAGE(0xA009);
+    status &= psu_resetin_init_data();
+    PSU_STAGE(0xB000);
+    status &= serdes_fixcal_code();
+    /* serdes_enb_coarse_saturation(): enable PLL coarse-code saturation logic */
+    Xil_Out32(0xFD402094UL, 0x00000010UL);
+    Xil_Out32(0xFD406094UL, 0x00000010UL);
+    Xil_Out32(0xFD40A094UL, 0x00000010UL);
+    Xil_Out32(0xFD40E094UL, 0x00000010UL);
+    PSU_STAGE(0xB001); status &= psu_serdes_init_data();
+    PSU_STAGE(0xB002); status &= psu_resetout_init_data();
+    PSU_STAGE(0xB003);
+#endif
+
+    /* init_peripheral(): SMMU interrupt enable (read-modify-write set of
+     * bits 0x8000001F, matching the board PSU_Mask_Write). This is NOT part of
+     * init_serdes(), so it runs even when the PS-GTR serdes block is skipped. */
+    smmu = Xil_In32(0xFD5F0018UL);
+    smmu &= ~0x8000001FUL;
+    smmu |= 0x8000001FUL;
+    Xil_Out32(0xFD5F0018UL, smmu);
+    PSU_STAGE(0xB004);
+
+    PSU_STAGE(0xA00A);
+    status &= psu_peripherals_powerdwn_data();
+    status &= psu_afi_config();
+    psu_ddr_qos_init_data(); /* return value is not folded into status */
+
+    PSU_STAGE(0xA0DD); /* psu_init fully complete */
+    if (status == 0) /* some checked sub-stage returned 0 */
+        return 1;
+    return 0;
+}
+
+#endif /* WOLFBOOT_ZYNQMP_FSBL */
diff --git a/include/wolfboot_smc.h b/include/wolfboot_smc.h
new file mode 100644
index 0000000000..c3ab1c15ff
--- /dev/null
+++ b/include/wolfboot_smc.h
@@ -0,0 +1,82 @@
+/* wolfboot_smc.h
+ *
+ * Copyright (C) 2025 wolfSSL Inc.
+ *
+ * This file is part of wolfBoot.
+ *
+ * wolfBoot is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfBoot is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA
+ */
+
+/* SMC ABI between the wolfBoot EL3 secure monitor (server) and the normal
+ * world (client). This is the shared contract used by both sides:
+ *
+ *   - The crypto / key / secure-storage surface is carried by the wolfHSM
+ *     client/server protocol over a shared-memory transport (wh_transport_mem).
+ *     SMC is used only as a doorbell: the client writes a request into the
+ *     shared buffer, issues WOLFBOOT_SMC_FID_HSM_DOORBELL, and the EL3 server
+ *     processes exactly one wolfHSM request and writes the response back.
+ *   - A few small fixed-ABI calls (firmware update trigger/status, version)
+ *     do not need the wolfHSM channel.
+ *
+ * Function IDs use the SMCCC Fast Call, SMC64, OEM range (0xC3000000+) to
+ * avoid the Xilinx SiP range (0xC2000000, used by PM_SIP_SVC in hal/zynq.c).
+ */
+
+#ifndef WOLFBOOT_SMC_H
+#define WOLFBOOT_SMC_H
+
+/* SMCCC Fast Call / SMC64 / OEM service range base. */
+#define WOLFBOOT_SMC_FID_BASE 0xC3000000U
+
+/* Doorbell: a wolfHSM request is ready in the shared buffer. The server runs
+ * one wh_Server_HandleRequestMessage() pass. Returns WOLFBOOT_SMC_OK on a
+ * processed request, WOLFBOOT_SMC_NOT_READY if none was pending. */
+#define WOLFBOOT_SMC_FID_HSM_DOORBELL (WOLFBOOT_SMC_FID_BASE + 0x00U)
+
+/* Firmware update / boot control (fixed ABI, no wolfHSM channel). */
+#define WOLFBOOT_SMC_FID_FW_UPDATE (WOLFBOOT_SMC_FID_BASE + 0x01U)
+#define WOLFBOOT_SMC_FID_FW_STATUS (WOLFBOOT_SMC_FID_BASE + 0x02U)
+#define WOLFBOOT_SMC_FID_VERSION (WOLFBOOT_SMC_FID_BASE + 0x03U)
+
+/* SMC return codes (in x0): 0 for success, negative for errors. NOTE(review):
+ * the values below are wolfBoot-specific, not the SMCCC numbering -- confirm. */
+#define WOLFBOOT_SMC_OK (0)
+#define WOLFBOOT_SMC_NOT_READY (-1) /* SMCCC uses -1 for NOT_SUPPORTED */
+#define WOLFBOOT_SMC_NOT_SUPPORTED (-2)
+#define WOLFBOOT_SMC_ERROR (-3)
+
+/* Shared-memory region for the wolfHSM transport (wh_transport_mem). Carved
+ * out of DDR and mapped on both sides; must be reserved from the OS via the
+ * device tree (/reserved-memory) so Linux does not use it. The region holds
+ * the request buffer followed by the response buffer.
+ *
+ * Override at build time with -DWOLFBOOT_HSM_SHM_BASE / _SIZE to match the
+ * DTB reservation and the wolfHSM whTransportMemConfig on both ends. */
+#ifndef WOLFBOOT_HSM_SHM_BASE
+#define WOLFBOOT_HSM_SHM_BASE 0x7F000000UL
+#endif
+#ifndef WOLFBOOT_HSM_SHM_SIZE
+#define WOLFBOOT_HSM_SHM_SIZE 0x00010000UL /* 64 KB total */
+#endif
+
+/* Split the region into request and response halves for wh_transport_mem. */
+#define WOLFBOOT_HSM_SHM_REQ_BASE (WOLFBOOT_HSM_SHM_BASE)
+#define WOLFBOOT_HSM_SHM_REQ_SIZE (WOLFBOOT_HSM_SHM_SIZE / 2U)
+#define WOLFBOOT_HSM_SHM_RESP_BASE (WOLFBOOT_HSM_SHM_BASE + \
+    WOLFBOOT_HSM_SHM_REQ_SIZE)
+#define WOLFBOOT_HSM_SHM_RESP_SIZE (WOLFBOOT_HSM_SHM_SIZE / 2U)
+
+#endif /* WOLFBOOT_SMC_H */
diff --git a/src/boot_aarch64.c b/src/boot_aarch64.c
index cf8d074796..c418cb44d9 100644
--- a/src/boot_aarch64.c
+++ b/src/boot_aarch64.c
@@ -242,7 +242,36 @@ void RAMFUNCTION arch_reboot(void)
  * ============================================================================
  */
 
-#if defined(DEBUG_HARDFAULT) && defined(DEBUG_UART) && defined(EL2_HYPERVISOR)
+#if defined(EL3_SECURE) && EL3_SECURE == 1
+
+/* EL3 exception reporting. Without this, a data abort / SError taken at EL3
+ * lands in the silent wfi stub below and looks like a hang. Print the syndrome
+ * so the fault class (ESR_EL3.EC) and faulting address (FAR_EL3) are visible.
*/ +static void print_exception_info_el3(const char *type) +{ + unsigned long esr = 0, elr = 0, far = 0; + __asm__ volatile("mrs %0, ESR_EL3" : "=r"(esr)); + __asm__ volatile("mrs %0, ELR_EL3" : "=r"(elr)); + __asm__ volatile("mrs %0, FAR_EL3" : "=r"(far)); + wolfBoot_printf("\n*** %s EXCEPTION (EL3) ***\n", type); + wolfBoot_printf("ESR_EL3: 0x%08x%08x\n", + (uint32_t)(esr >> 32), (uint32_t)esr); + wolfBoot_printf("ELR_EL3: 0x%08x%08x\n", + (uint32_t)(elr >> 32), (uint32_t)elr); + wolfBoot_printf("FAR_EL3: 0x%08x%08x\n", + (uint32_t)(far >> 32), (uint32_t)far); +} + +void SynchronousInterrupt(void) + { print_exception_info_el3("SYNCHRONOUS"); while (1) { __asm__ volatile("wfi"); } } +void IRQInterrupt(void) + { print_exception_info_el3("IRQ"); while (1) { __asm__ volatile("wfi"); } } +void FIQInterrupt(void) + { print_exception_info_el3("FIQ"); while (1) { __asm__ volatile("wfi"); } } +void SErrorInterrupt(void) + { print_exception_info_el3("SERROR"); while (1) { __asm__ volatile("wfi"); } } + +#elif defined(DEBUG_HARDFAULT) && defined(DEBUG_UART) && defined(EL2_HYPERVISOR) #define READ_SYSREG(_out, _reg) __asm__ volatile("mrs %0, " #_reg : "=r"(_out)) diff --git a/src/boot_aarch64_start.S b/src/boot_aarch64_start.S index 544b5e8de9..e4e1e607cd 100644 --- a/src/boot_aarch64_start.S +++ b/src/boot_aarch64_start.S @@ -213,9 +213,15 @@ InitEL3: #endif msr S3_1_C15_C2_0, x0 /* CPUACTLR_EL1 */ - /* Program the counter frequency */ + /* Program the counter frequency. + * For the FSBL-replacement build, do NOT overwrite CNTFRQ_EL0: the BootROM + * sets it to the actual system-counter rate, which our usleep() needs for + * accurate psu_init DDR-training delays. Hardcoding 100MHz here is wrong + * when the counter runs at the BootROM (undivided) rate. 
*/ +#ifndef WOLFBOOT_ZYNQMP_FSBL ldr x0,=counterfreq msr CNTFRQ_EL0, x0 +#endif /* Enable hardware coherency between cores */ mrs x0, S3_1_c15_c2_1 /* Read EL1 CPU Extended Control Register */ @@ -557,7 +563,13 @@ invalidatecaches_end: */ .set reserved, 0x0 /* Fault */ -#if defined(EL1_NONSECURE) && EL1_NONSECURE == 1 +#if defined(ZYNQMP_DDR_NC) && ZYNQMP_DDR_NC == 1 +.set Memory, 0x401 | (0 << 8) | (0x0) /* DDR as Normal NON-cacheable (MAIR idx0=0x44) - diag */ +#elif defined(ZYNQMP_DDR_DEVICE) && ZYNQMP_DDR_DEVICE == 1 +.set Memory, 0x409 | (1 << 53) | (1 << 54) /* DDR as Device-nGnRnE (no speculation) - diag */ +#elif defined(ZYNQMP_DDR_NONSHARED) && ZYNQMP_DDR_NONSHARED == 1 +.set Memory, 0x405 | (0 << 8) | (0x0) /* normal writeback write allocate NON-shareable read write */ +#elif defined(EL1_NONSECURE) && EL1_NONSECURE == 1 .set Memory, 0x405 | (2 << 8) | (0x0) /* normal writeback write allocate outer shared read write */ #else .set Memory, 0x405 | (3 << 8) | (0x0) /* normal writeback write allocate inner shared read write */ @@ -1212,6 +1224,74 @@ flush_dcache_range: ret +/* + * void zynqmp_dcache_disable(void) + * + * Clean+invalidate the entire data cache to PoC by set/way, then disable the + * D-cache (SCTLR_EL3.C=0) and drop the core out of the coherency domain + * (CPUECTLR_EL1.SMPEN=0). Used to TEST whether the QSPI body-load failure is + * purely a coherency artifact: with no cache lines to allocate into and the + * core out of the snoop domain, the coherent QSPIDMA writes must land in DDR. + * Clobbers x0-x11. Modeled on the ARMv8 set/way routine used elsewhere here. 
+ */ +.global zynqmp_dcache_disable +zynqmp_dcache_disable: + mrs x0, clidr_el1 + and x3, x0, #0x07000000 + lsr x3, x3, #23 /* x3 = LoC * 2 */ + cbz x3, .Lzdd_done + mov x10, #0 +.Lzdd_level: + add x2, x10, x10, lsr #1 + lsr x1, x0, x2 + and x1, x1, #7 + cmp x1, #2 + b.lt .Lzdd_skip + msr csselr_el1, x10 + isb + mrs x1, ccsidr_el1 + and x2, x1, #7 + add x2, x2, #4 /* x2 = log2(line len) */ + mov x4, #0x3ff + and x4, x4, x1, lsr #3 /* x4 = max way */ + clz w5, w4 + mov x7, #0x7fff + and x7, x7, x1, lsr #13 /* x7 = max set */ +.Lzdd_set: + mov x9, x4 +.Lzdd_way: + lsl x6, x9, x5 + orr x11, x10, x6 + lsl x6, x7, x2 + orr x11, x11, x6 + dc cisw, x11 + subs x9, x9, #1 + b.ge .Lzdd_way + subs x7, x7, #1 + b.ge .Lzdd_set +.Lzdd_skip: + add x10, x10, #2 + cmp x3, x10 + b.gt .Lzdd_level +.Lzdd_done: + mov x10, #0 + msr csselr_el1, x10 + dsb sy + isb + /* disable D-cache (SCTLR_EL3.C, bit 2) */ + mrs x1, sctlr_el3 + bic x1, x1, #(1 << 2) + msr sctlr_el3, x1 + isb + /* leave the SMP/coherency domain (CPUECTLR_EL1.SMPEN, bit 6) */ + mrs x1, S3_1_C15_C2_1 + bic x1, x1, #(1 << 6) + msr S3_1_C15_C2_1, x1 + dsb sy + isb + ret + + /* Initialize GIC 400 (GICv2) */ .global gicv2_init_secure gicv2_init_secure: @@ -1448,4 +1528,91 @@ el2_flush_and_disable_mmu: ret #endif /* EL2_HYPERVISOR */ +#if defined(WOLFBOOT_ZYNQMP_FSBL) +/* + * void el3_to_atf_boot(uintptr_t bl31_entry) + * + * Final handoff from wolfBoot (running as the ZynqMP FSBL at EL3) to ARM + * Trusted Firmware BL31. Cleans & invalidates the entire D-cache to the PoC, + * invalidates the I-cache, disables the EL3 MMU + I/D-caches, then branches to + * the BL31 entry point. BL31 expects to be entered at EL3 with the MMU off + * (mirrors what the stock FSBL's XFsbl_Exit() does). + * + * The ATF handoff parameters (BL33 entry/EL) are passed to BL31 separately via + * the PMU_GLOBAL.GLOBAL_GEN_STORAGE6 register (written by hal/zynqmp_atf.c + * before calling this), so no GP-register arguments are needed here. 
+ * + * x0: bl31_entry. The set/way loop below clobbers x0-x11, so the entry point + * is preserved in x19 (callee-saved, untouched by the loop). Does not return. + */ +.global el3_to_atf_boot +el3_to_atf_boot: + mov x19, x0 /* preserve BL31 entry across the loop */ + + /* ---- 1. Clean & invalidate entire data cache to PoC by set/way ---- */ + mrs x0, clidr_el1 + and x3, x0, #0x07000000 /* x3 = LoC (level of coherency) */ + lsr x3, x3, #23 /* x3 = LoC * 2 */ + cbz x3, .Latf_dcache_done + mov x10, #0 /* x10 = current cache level << 1 */ + +.Latf_dcache_level_loop: + add x2, x10, x10, lsr #1 /* x2 = level * 3 */ + lsr x1, x0, x2 /* x1 = ctype field for this level */ + and x1, x1, #7 + cmp x1, #2 + b.lt .Latf_dcache_skip_level /* No data cache at this level */ + msr csselr_el1, x10 /* Select cache level (instruction = 0) */ + isb + mrs x1, ccsidr_el1 + and x2, x1, #7 /* x2 = log2(line length) - 4 */ + add x2, x2, #4 /* x2 = log2(line length) */ + mov x4, #0x3ff + and x4, x4, x1, lsr #3 /* x4 = max way number */ + clz w5, w4 /* x5 = bit position of way size */ + mov x7, #0x7fff + and x7, x7, x1, lsr #13 /* x7 = max set number */ + +.Latf_dcache_set_loop: + mov x9, x4 /* x9 = current way */ +.Latf_dcache_way_loop: + lsl x6, x9, x5 + orr x11, x10, x6 /* level | way */ + lsl x6, x7, x2 + orr x11, x11, x6 /* level | way | set */ + dc cisw, x11 /* clean & invalidate by set/way */ + subs x9, x9, #1 + b.ge .Latf_dcache_way_loop + subs x7, x7, #1 + b.ge .Latf_dcache_set_loop + +.Latf_dcache_skip_level: + add x10, x10, #2 + cmp x3, x10 + b.gt .Latf_dcache_level_loop + +.Latf_dcache_done: + mov x10, #0 + msr csselr_el1, x10 + dsb sy + isb + + /* ---- 2. Invalidate entire I-cache to PoU ---- */ + ic iallu + dsb ish + isb + + /* ---- 3. Disable MMU + I-cache + D-cache at EL3 ---- */ + mrs x0, SCTLR_EL3 + bic x0, x0, #(1 << 0) /* M */ + bic x0, x0, #(1 << 2) /* C */ + bic x0, x0, #(1 << 12) /* I */ + msr SCTLR_EL3, x0 + dsb sy + isb + + /* ---- 4. 
Branch to BL31 (does not return) ---- */ + br x19 +#endif /* WOLFBOOT_ZYNQMP_FSBL */ + .end diff --git a/src/update_disk.c b/src/update_disk.c index 767df7c266..ad0f971d73 100644 --- a/src/update_disk.c +++ b/src/update_disk.c @@ -47,6 +47,9 @@ #ifdef WOLFBOOT_ELF #include "elf.h" #endif +#if defined(WOLFBOOT_ZYNQMP_FSBL) && defined(MMU) +#include "../hal/zynqmp_atf.h" +#endif /* Disk encryption support for AES-256, AES-128, or ChaCha20 */ #if defined(ENCRYPT_WITH_AES256) || defined(ENCRYPT_WITH_AES128) || \ @@ -282,6 +285,10 @@ void RAMFUNCTION wolfBoot_start(void) #ifdef WOLFBOOT_FDT uint32_t dts_size = 0; #endif +#endif +#if defined(WOLFBOOT_ZYNQMP_FSBL) && defined(MMU) + /* BL31 (ARM-TF) entry, set when the boot FIT carries an "atf" sub-image. */ + uintptr_t bl31_entry = 0; #endif char part_name[4] = {'P', ':', 'X', '\0'}; BENCHMARK_DECLARE(); @@ -600,6 +607,16 @@ void RAMFUNCTION wolfBoot_start(void) } load_address = new_load; } +#if defined(WOLFBOOT_ZYNQMP_FSBL) && defined(MMU) + /* Load BL31 (ARM-TF) from the FIT "atf" sub-image, if present.
*/ + { + void *atf_load = fit_load_image(fit, "atf", NULL); + if (atf_load != NULL) { + bl31_entry = (uintptr_t)atf_load; + wolfBoot_printf("FIT: BL31 (atf) loaded at %p\r\n", atf_load); + } + } +#endif if (flat_dt != NULL) { uint8_t *dts_ptr = fit_load_image(fit, flat_dt, (int*)&dts_size); if (dts_ptr != NULL && wolfBoot_get_dts_size(dts_ptr) >= 0) { @@ -657,6 +674,14 @@ void RAMFUNCTION wolfBoot_start(void) #ifdef DISK_ENCRYPT disk_decrypted_header_clear(dec_hdr); disk_crypto_clear(); +#endif +#if defined(WOLFBOOT_ZYNQMP_FSBL) && defined(MMU) + if (bl31_entry != 0) { + wolfBoot_printf("Handing off to BL31 at 0x%x (kernel 0x%x)\r\n", + (uint32_t)bl31_entry, (uint32_t)(uintptr_t)load_address); + zynqmp_atf_handoff(bl31_entry, (uintptr_t)load_address, + (uintptr_t)dts_addr, ZYNQMP_ATF_EL2); + } #endif do_boot((uint32_t*)load_address #ifdef MMU diff --git a/src/update_ram.c b/src/update_ram.c index 9407e0a93a..9006316ac3 100644 --- a/src/update_ram.c +++ b/src/update_ram.c @@ -41,6 +41,9 @@ #ifdef WOLFBOOT_ELF #include "elf.h" #endif +#if defined(WOLFBOOT_ZYNQMP_FSBL) && defined(MMU) +#include "../hal/zynqmp_atf.h" +#endif extern void hal_flash_dualbank_swap(void); extern int wolfBoot_get_dts_size(void *dts_addr); @@ -69,6 +72,17 @@ int wolfBoot_ramboot(struct wolfBoot_image *img, uint8_t *src, uint8_t *dst) uint32_t img_size; BENCHMARK_DECLARE(); +#ifdef ZYNQMP_DCACHE_OFF_LOAD + /* TEST: clean+invalidate+disable the APU D-cache and leave the coherency + * domain before loading. If the QSPI body load then verifies, the failure + * is purely the coherent QSPIDMA landing in cache instead of DDR. 
*/ + { + extern void zynqmp_dcache_disable(void); + zynqmp_dcache_disable(); + wolfBoot_printf("DCACHE OFF for load (coherency test)\n"); + } +#endif + /* read header into RAM */ wolfBoot_printf("Loading header %d bytes from %p to %p\n", IMAGE_HEADER_SIZE, src, dst); @@ -113,23 +127,70 @@ int wolfBoot_ramboot(struct wolfBoot_image *img, uint8_t *src, uint8_t *dst) } #endif -#if defined(__WOLFBOOT) && defined(WOLFBOOT_LOAD_ADDRESS) - /* Runtime overlap check: ensure image destination does not overwrite - * wolfBoot's own code/data/bss in RAM. */ - if ((uintptr_t)dst < (uintptr_t)_end) { - wolfBoot_printf("Error: image dest %p overlaps wolfBoot end %p\n", - dst, _end); - return -1; +#if defined(__WOLFBOOT) && defined(WOLFBOOT_LOAD_ADDRESS) && \ + defined(WOLFBOOT_ORIGIN) + /* Runtime overlap check: ensure the image destination does not overwrite + * wolfBoot's own code/data/bss. wolfBoot occupies [WOLFBOOT_ORIGIN, _end]; + * the image occupies [dst, dst + header + size]. Use a proper range + * intersection so it is correct whether wolfBoot is below the image (image + * loaded above it) or above it -- e.g. as the ZynqMP FSBL wolfBoot runs + * from OCM (high addresses) while the image loads to DDR (low addresses), + * where the old "dst < _end" test gave a false positive. */ + { + uintptr_t wb_lo = (uintptr_t)(WOLFBOOT_ORIGIN); + uintptr_t wb_hi = (uintptr_t)_end; + uintptr_t img_lo = (uintptr_t)dst; + uintptr_t img_hi = img_lo + (uintptr_t)IMAGE_HEADER_SIZE + + (uintptr_t)img_size; + if (img_lo < wb_hi && img_hi > wb_lo) { + wolfBoot_printf("Error: image %p-%p overlaps wolfBoot %p-%p\n", + (void*)img_lo, (void*)img_hi, (void*)wb_lo, (void*)wb_hi); + return -1; + } } #endif /* Read the entire image into RAM */ wolfBoot_printf("Loading image %d bytes from %p to %p...", img_size, src + IMAGE_HEADER_SIZE, dst + IMAGE_HEADER_SIZE); +#ifdef ZYNQMP_DDR_SCRUB + /* CPU-initialize ("scrub") the destination DDR before the DMA load. 
On + * ZynqMP, if DDR ECC is enabled the controller computes ECC on writes; a + * region that has never been CPU-written holds uninitialized ECC, and a + * non-ECC-aware DMA write / subsequent read returns wrong data. Writing the + * region from the CPU first establishes valid ECC. TEST gate. */ + { + volatile uint8_t* z = (volatile uint8_t*)dst; + uint32_t zi, zn = (uint32_t)IMAGE_HEADER_SIZE + img_size; + for (zi = 0; zi < zn; zi += 4) { + *((volatile uint32_t*)(z + zi)) = 0; + } + __asm__ volatile("dsb sy" ::: "memory"); + } +#endif BENCHMARK_START(); #if defined(EXT_FLASH) && defined(NO_XIP) +#ifdef ZYNQMP_FSBL_SPLIT_READ + /* The ZynqMP GQSPI zeroes the first ~128KB of any single ext_flash_read of + * >=32KB; reads <32KB always return correct data (hardware-measured size + * threshold). Split the body into <32KB chunks so every read is reliable. + * The flash address advances per call so the data is contiguous. */ + { + uintptr_t s = (uintptr_t)src + IMAGE_HEADER_SIZE; + uint8_t* d = dst + IMAGE_HEADER_SIZE; + uint32_t left = img_size; + ret = 0; + while (left > 0) { + uint32_t n = (left > 0x400U) ? 0x400U : left; /* 1KB */ + int r = ext_flash_read(s, d, n); + if (r < 0) { ret = r; break; } + s += n; d += n; left -= n; ret += r; + } + } +#else ret = ext_flash_read((uintptr_t)src + IMAGE_HEADER_SIZE, dst + IMAGE_HEADER_SIZE, img_size); +#endif if (ret < 0) { wolfBoot_printf("Error reading image at %p\n", src); return -1; @@ -139,6 +200,44 @@ int wolfBoot_ramboot(struct wolfBoot_image *img, uint8_t *src, uint8_t *dst) #endif BENCHMARK_END("done"); +#ifdef DEBUG_ZYNQ + /* Diagnostic: ext_flash_read return value + what actually landed in RAM + * at the start and tail of the loaded firmware (the integrity hash reads + * exactly this). ELF payloads start with 0x464c457f ("\x7fELF"). 
*/ + { + volatile uint32_t* fw = (volatile uint32_t*)(dst + IMAGE_HEADER_SIZE); + uint32_t before, after; + wolfBoot_printf("ramboot: ret=%d sz=%d\n", ret, img_size); + wolfBoot_printf(" fw@0=%08x @0x2000=%08x @0x8000=%08x @0x10000=%08x\n", + (uint32_t)fw[0], (uint32_t)fw[0x2000/4], + (uint32_t)fw[0x8000/4], (uint32_t)fw[0x10000/4]); + wolfBoot_printf(" fw@0x20000=%08x @0x30000=%08x\n", + (uint32_t)fw[0x20000/4], (uint32_t)fw[0x30000/4]); /* NOTE(review): offsets up to 0x30000 are read regardless of img_size -- confirm the body is always that large */ + /* Cache-vs-DDR probe: read fw[0], then invalidate that cache line and + * read again. If the value CHANGES after invalidate, the first read was + * a stale cache line and DDR holds different (DMA'd) data -> coherency + * bug. If it stays the same, DDR physically holds this value. NOTE(review): no probe code follows; before/after are only voided. */ + (void)before; (void)after; + /* Per-16KB checksum of the loaded body vs the expected file values -- + * pinpoints the exact 16KB block where the load first diverges (bug #2). + * exp: 77950825 0 0 0 e29ff3b0 62b362b1 1403ee10 f91b6f24 02be396c + * 21ce34bf 73bf6bb1 9c609099 c7fa4209 65ebb688 (blocks 0..13). */ + { + uint32_t blk, i, csum; + for (blk = 0; blk * 0x4000U < img_size; blk++) { + csum = 0; + for (i = 0; i < 0x4000U/4U && + (blk*0x4000U + i*4U) < img_size; i++) { + csum = (csum + (uint32_t)fw[(blk*0x4000U)/4 + i]) + & 0xFFFFFFFFU; + } + wolfBoot_printf(" body 0x%05x csum=%08x\n", + (unsigned)(blk*0x4000U), csum); + } + } + } +#endif + /* mark image as no longer external */ img->not_ext = 1; @@ -250,6 +349,12 @@ void RAMFUNCTION wolfBoot_start(void) uint8_t *dts_addr = NULL; uint32_t dts_size = 0; #endif +#if defined(WOLFBOOT_ZYNQMP_FSBL) && defined(MMU) + /* When wolfBoot is the FSBL, the boot FIT carries an "atf" (BL31) + * sub-image. If present, hand off to BL31 instead of jumping to the + * kernel directly.
*/ + uintptr_t bl31_entry = 0; +#endif #if !defined(ALLOW_DOWNGRADE) && defined(WOLFBOOT_FIXED_PARTITIONS) uint32_t boot_v = wolfBoot_current_firmware_version(); uint32_t update_v = wolfBoot_update_firmware_version(); @@ -528,6 +633,18 @@ void RAMFUNCTION wolfBoot_start(void) } load_address = new_load; } +#if defined(WOLFBOOT_ZYNQMP_FSBL) && defined(MMU) + /* Load BL31 (ARM Trusted Firmware) to its DDR exec address. Its entry + * point is the FIT `load`/`entry` address returned here. Optional: if + * absent, fall through to the normal direct boot. */ + { + void *atf_load = fit_load_image(fit, "atf", NULL); + if (atf_load != NULL) { + bl31_entry = (uintptr_t)atf_load; + wolfBoot_printf("FIT: BL31 (atf) loaded at %p\n", atf_load); + } + } +#endif if (flat_dt != NULL) { uint8_t *dts_ptr = fit_load_image(fit, flat_dt, (int*)&dts_size); if (dts_ptr != NULL && wolfBoot_get_dts_size(dts_ptr) >= 0) { @@ -605,6 +722,17 @@ void RAMFUNCTION wolfBoot_start(void) #ifndef WOLFBOOT_SKIP_BOOT_VERIFY PART_SANITY_CHECK(&os_image); #endif +#if defined(WOLFBOOT_ZYNQMP_FSBL) && defined(MMU) + if (bl31_entry != 0) { + /* Hand off to BL31 (resident EL3 monitor). BL31 starts the kernel + * (BL33) at EL2; the DTB is forwarded via PMU_GLOBAL scratch (see + * hal/zynqmp_atf.c). Does not return.
*/ + wolfBoot_printf("Handing off to BL31 at 0x%x (kernel 0x%x)\n", + (uint32_t)bl31_entry, (uint32_t)(uintptr_t)load_address); + zynqmp_atf_handoff(bl31_entry, (uintptr_t)load_address, + (uintptr_t)dts_addr, ZYNQMP_ATF_EL2); + } +#endif #ifdef MMU do_boot((uint32_t*)load_address, (uint32_t*)dts_addr); diff --git a/tools/scripts/zcu102/tf-a-zynqmp-wolfboot-dtb.patch b/tools/scripts/zcu102/tf-a-zynqmp-wolfboot-dtb.patch new file mode 100644 index 0000000000..07477b48a4 --- /dev/null +++ b/tools/scripts/zcu102/tf-a-zynqmp-wolfboot-dtb.patch @@ -0,0 +1,59 @@ +plat: xilinx: zynqmp: forward DTB to a direct-kernel BL33 + +When wolfBoot replaces the Xilinx FSBL and boots a Linux kernel directly as +BL33 (no U-Boot), BL31 must hand the kernel its device tree in x0 (the arm64 +boot protocol requires the DTB pointer in x0). Stock ZynqMP TF-A leaves the +BL33 entry arguments zeroed, which is fine for a U-Boot BL33 but not for a +direct kernel. + +wolfBoot publishes the DTB address in PMU_GLOBAL.GLOBAL_GEN_STORAGE5 (it already +uses GLOBAL_GEN_STORAGE6 for the standard ATF handoff-parameter block). This +patch reads GEN_STORAGE5 and, when non-zero, places it in BL33's arg0. When the +register is zero (JTAG/default path, or a U-Boot BL33), behaviour is unchanged. + +Apply against the Xilinx ARM Trusted Firmware tree: + cd arm-trusted-firmware + git apply /path/to/tf-a-zynqmp-wolfboot-dtb.patch + make PLAT=zynqmp RESET_TO_BL31=1 bl31 + +Build BL31 with its link base in DDR (ZYNQMP_ATF_MEM_BASE in DDR), not the +default OCM 0xFFFE0000, because wolfBoot occupies OCM as the FSBL. 
+ +diff --git a/plat/xilinx/zynqmp/bl31_zynqmp_setup.c b/plat/xilinx/zynqmp/bl31_zynqmp_setup.c +index 65616e599..b9a4284e3 100644 +--- a/plat/xilinx/zynqmp/bl31_zynqmp_setup.c ++++ b/plat/xilinx/zynqmp/bl31_zynqmp_setup.c +@@ -143,6 +143,21 @@ void bl31_early_platform_setup2(u_register_t arg0, u_register_t arg1, + panic(); + } + } ++ ++ /* ++ * wolfBoot (as the FSBL) publishes the BL33 device tree address in ++ * PMU_GLOBAL_GEN_STORAGE5. The arm64 boot protocol requires the DTB ++ * pointer in x0, but stock TF-A leaves BL33's args zeroed (fine for a ++ * U-Boot BL33, not for a direct kernel BL33). Forward it when set. ++ */ ++ { ++ uint64_t dtb_addr = (uint64_t)mmio_read_32(PMU_GLOBAL_GEN_STORAGE5); ++ ++ if (dtb_addr != 0U) { ++ bl33_image_ep_info.args.arg0 = dtb_addr; ++ } ++ } ++ + if (bl32_image_ep_info.pc != 0U) { + NOTICE("BL31: Secure code at 0x%lx\n", bl32_image_ep_info.pc); + } +diff --git a/plat/xilinx/zynqmp/include/zynqmp_def.h b/plat/xilinx/zynqmp/include/zynqmp_def.h +index cd3bbbc64..19156a5fb 100644 +--- a/plat/xilinx/zynqmp/include/zynqmp_def.h ++++ b/plat/xilinx/zynqmp/include/zynqmp_def.h +@@ -102,6 +102,7 @@ + #define PMU_GLOBAL_BASE U(0xFFD80000) + #define PMU_GLOBAL_CNTRL (PMU_GLOBAL_BASE + 0) + #define PMU_GLOBAL_GEN_STORAGE6 (PMU_GLOBAL_BASE + U(0x48)) ++#define PMU_GLOBAL_GEN_STORAGE5 (PMU_GLOBAL_BASE + U(0x44)) + #define PMU_GLOBAL_REQ_PWRUP_STATUS (PMU_GLOBAL_BASE + U(0x110)) + #define PMU_GLOBAL_REQ_PWRUP_EN (PMU_GLOBAL_BASE + U(0x118)) + #define PMU_GLOBAL_REQ_PWRUP_DIS (PMU_GLOBAL_BASE + U(0x11c)) diff --git a/tools/scripts/zcu102/zcu102-ca53-qspi.cmm b/tools/scripts/zcu102/zcu102-ca53-qspi.cmm index e6049d7f73..b374fc0738 100644 --- a/tools/scripts/zcu102/zcu102-ca53-qspi.cmm +++ b/tools/scripts/zcu102/zcu102-ca53-qspi.cmm @@ -127,13 +127,20 @@ if &programnow FLASHFILE.ReProgram off ) -; Flash signed application image at partition offset -DIALOG.YESNO "Flash signed application image now?" 
+; Flash signed FIT image at the wolfBoot boot partition offset. +; Offset must match WOLFBOOT_PARTITION_BOOT_ADDRESS in the active .config +; (config/examples/zynqmp_fsbl.config = 0x800000). This is the combined +; dual-parallel QSPI linear offset, same address space as BOOT.BIN@0x0. +; The signed FIT is produced by: +; mkimage -f hal/zynqmp_fsbl.its fitImage +; ./tools/keytools/sign --rsa4096 --sha3 fitImage wolfboot_signing_private_key.der 1 +; and must sit in the TRACE32 working directory (alongside BOOT.BIN). +DIALOG.YESNO "Flash signed FIT image (fitImage_v1_signed.bin) now?" ENTRY &programnow if &programnow ( FLASHFILE.ReProgram ALL - FLASHFILE.Load "test-app/image_v1_signed.bin" 0x7000000 + FLASHFILE.Load "fitImage_v1_signed.bin" 0x800000 FLASHFILE.ReProgram off ) diff --git a/tools/scripts/zcu102/zynqmp_wolfboot_fsbl.bif b/tools/scripts/zcu102/zynqmp_wolfboot_fsbl.bif new file mode 100644 index 0000000000..aae83d072b --- /dev/null +++ b/tools/scripts/zcu102/zynqmp_wolfboot_fsbl.bif @@ -0,0 +1,23 @@ +// Boot BIF for wolfBoot as the ZynqMP FSBL REPLACEMENT (ZCU102). +// +// wolfBoot is the [bootloader] partition: the BootROM loads it into OCM and +// enters it at EL3. There is NO zynqmp_fsbl.elf. PMUFW is still loaded by the +// BootROM via [pmufw_image]. wolfBoot itself runs psu_init() and then loads + +// verifies the downstream images (BL31, kernel, DTB) from its own QSPI/SD +// partitions using wolfBoot signing keys, so they are NOT listed here. +// +// The same image works for QSPI and SD boot; the boot device is selected by +// the ZCU102 boot-mode pins (SW6). See README for deployment.
+// +// Usage: +// bootgen -arch zynqmp -image tools/scripts/zcu102/zynqmp_wolfboot_fsbl.bif \ +// -w -o BOOT.BIN +// +// Required files (copy next to this BIF, or use full paths): +// wolfboot.elf - wolfBoot, linked at OCM 0xFFFC0000 (built ZYNQMP_FSBL=1) +// pmufw.elf - Platform Management Unit firmware (from PetaLinux/Vitis) +the_ROM_image: +{ + [bootloader, destination_cpu=a53-0] wolfboot.elf + [pmufw_image] pmufw.elf +} diff --git a/tools/scripts/zcu102/zynqmp_wolfboot_fsbl_auth.bif b/tools/scripts/zcu102/zynqmp_wolfboot_fsbl_auth.bif new file mode 100644 index 0000000000..838be3ff0e --- /dev/null +++ b/tools/scripts/zcu102/zynqmp_wolfboot_fsbl_auth.bif @@ -0,0 +1,41 @@ +// Boot BIF for wolfBoot as the ZynqMP FSBL REPLACEMENT (ZCU102) with Xilinx +// hardware root-of-trust (RSA authentication via eFuse PPK). +// +// Same layout as zynqmp_wolfboot_fsbl.bif but the BootROM authenticates +// wolfBoot (and PMUFW) with RSA-4096/SHA3-384 before running them. This is the +// hardware root of trust: the BootROM verifies wolfBoot against the eFuse PPK +// hash; wolfBoot then verifies the downstream images with its OWN keys. +// +// Bring-up vs production: +// - [fsbl_config] bh_auth_enable below makes the BootROM trust the PPK in the +// boot header WITHOUT checking it against the eFuse PPK hash. Use this for +// development only. Remove it for production and program the eFuse PPK0 +// hash + blow RSA_EN so the BootROM enforces the check. +// - Generate the PPK eFuse hash: bootgen -arch zynqmp -efuseppkbits ... +// - Program eFuses with the xilskey programming example. 
+// +// Usage: +// bootgen -arch zynqmp \ +// -image tools/scripts/zcu102/zynqmp_wolfboot_fsbl_auth.bif \ +// -w -o BOOT.BIN +// +// Required files: +// wolfboot.elf - wolfBoot linked at OCM 0xFFFC0000 (built ZYNQMP_FSBL=1) +// pmufw.elf - Platform Management Unit firmware +// pskf.pem - primary secret (private) key +// sskf.pem - secondary secret (private) key +the_ROM_image: +{ + // Development only: trust the boot-header PPK without the eFuse check. + // Remove for production (program the eFuse PPK0 hash + RSA_EN instead). + [fsbl_config] bh_auth_enable + + // Primary public key 0, secondary public key id 0 + [auth_params] ppk_select=0; spk_id=0x00000000 + + [pskfile] pskf.pem + [sskfile] sskf.pem + + [bootloader, destination_cpu=a53-0, authentication=rsa] wolfboot.elf + [pmufw_image, authentication=rsa] pmufw.elf +}