/*
 * RAS driver for T194
 *
 * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/debugfs.h>
#include <linux/platform_device.h>
#include <linux/platform/tegra/carmel_ras.h>
#include <linux/platform/tegra/tegra18_cpu_map.h>

static LIST_HEAD(core_ras_list);
static DEFINE_RAW_SPINLOCK(core_ras_lock);
static LIST_HEAD(corecluster_ras_list);
static DEFINE_RAW_SPINLOCK(corecluster_ras_lock);
static LIST_HEAD(ccplex_ras_list);
static DEFINE_RAW_SPINLOCK(ccplex_ras_lock);
static struct dentry *debugfs_dir;
static struct dentry *debugfs_node;
static int is_debug;
/* saved hotplug state */
static enum cpuhp_state hp_state;

/* Error Records per CORE - IFU errors
 * error_code = value of ARM_ERR_STATUS:IERR[15:8]
 */
static struct ras_error ifu_errors[] = {
	{.name = "IMQ Data Parity", .error_code = 0x08},
	{.name = "L2 I$ Fetch Uncorrectable", .error_code = 0x07},
	{.name = "I$ Tag Parity Snoop", .error_code = 0x06},
	{.name = "I$ Multi-Hit Snoop", .error_code = 0x05},
	{.name = "ITLB Parity", .error_code = 0x04},
	{.name = "Trace Hash Error", .error_code = 0x03},
	{.name = "I$ Data Parity", .error_code = 0x02},
	{.name = "I$ Tag Parity", .error_code = 0x01},
	{.name = "I$ Multi-Hit", .error_code = 0x0F},
	{}
};

/* Error Records per CORE - RET JSR errors */
static struct ras_error ret_jsr_errors[] = {
	{.name = "FRF Parity", .error_code = 0x13},
	{.name = "IRF Parity", .error_code = 0x12},
	{.name = "Garbage Bundle", .error_code = 0x11},
	{.name = "Bundle Completion Timeout", .error_code = 0x10},
	{}
};

/* Error Records per CORE - MTS JSR errors */
static struct ras_error mts_jsr_errors[] = {
	{.name = "CTU MMIO Region", .error_code = 0x25},
	{.name = "MTS MMCRAB Region Access", .error_code = 0x24},
	{.name = "MTS_CARVEOUT Access from ARM SW", .error_code = 0x23},
	{.name = "NAFLL PLL Failure to Lock", .error_code = 0x22},
	{.name = "Internal Correctable MTS Error", .error_code = 0x21},
	{.name = "Internal Uncorrectable MTS Error", .error_code = 0x20},
	{}
};

/* Error Records per CORE - LSD_1/LSD_STQ errors */
static struct ras_error lsd_1_errors[] = {
	{.name = "Coherent Cache Data Store Multi-Line ECC", .error_code = 0x39},
	{.name = "Coherent Cache Data Store Uncorrectable ECC", .error_code = 0x38},
	{.name = "Coherent Cache Data Store Correctable ECC", .error_code = 0x37},
	{.name = "Coherent Cache Data Load Uncorrectable ECC", .error_code = 0x36},
	{.name = "Coherent Cache Data Load Correctable ECC", .error_code = 0x35},
	{.name = "Coherent Cache Multi-Hit", .error_code = 0x33},
	{.name = "Coherent Cache Tag Store Parity", .error_code = 0x31},
	{.name = "Coherent Cache Tag Load Parity", .error_code = 0x30},
	{}
};

/* Error Records per CORE - LSD_2/LSD_ECC errors */
static struct ras_error lsd_2_errors[] = {
	{.name = "BTU Copy Mini-Cache PPN Multi-Hit Error", .error_code = 0x49},
	{.name = "Coherent Cache Data Uncorrectable ECC", .error_code = 0x47},
	{.name = "Coherent Cache Data Correctable ECC", .error_code = 0x46},
	{.name = "Version Cache Byte-Enable Parity", .error_code = 0x45},
	{.name = "Version Cache Data Uncorrectable ECC", .error_code = 0x44},
	{.name = "Version Cache Data Correctable ECC", .error_code = 0x43},
	{.name = "BTU Copy Coherent Cache PPN Parity", .error_code = 0x41},
	{.name = "BTU Copy Coherent Cache VPN Parity", .error_code = 0x40},
	{}
};

/* Error Records per CORE - LSD_3/LSD_L1HPF errors */
static struct ras_error lsd_3_errors[] = {
	{.name = "L2 TLB Parity Error", .error_code = 0xE0},
	{}
};
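/*
 * The tables above give printable names for the implementation-defined
 * IERR codes, and the error_record tables that follow bind each RAS error
 * record to its ERRSELR index, enable mask and name table.  Both structure
 * types are defined in the arm64/carmel RAS headers, which are not part of
 * this file; judging only by the fields used here, they look roughly like
 * the sketch below (field order, types and any extra members are
 * assumptions, not the authoritative definitions):
 *
 *	struct ras_error {
 *		char *name;		// printable description
 *		u8 error_code;		// matches ERR<n>STATUS.IERR[15:8]
 *	};
 *
 *	struct error_record {
 *		char *name;		// record group name
 *		u64 errx;		// index written to ERRSELR_EL1
 *		u64 err_ctrl;		// value programmed into ERR<n>CTLR
 *		struct ras_error *errors;	// name table for IERR codes
 *		struct list_head node;	// linkage on the per-scope lists
 *	};
 */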
/* Error Records per CORE */
static struct error_record core_ers[] = {
	{.name = "IFU", .errx = 0,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_IFU_ICMH_ERR | ERR_CTL_IFU_ICTP_ERR |
		     ERR_CTL_IFU_ICDP_ERR | ERR_CTL_IFU_THERR_ERR |
		     ERR_CTL_IFU_ITLBP_ERR | ERR_CTL_IFU_ICMHSNP_ERR |
		     ERR_CTL_IFU_ICTPSNP_ERR | ERR_CTL_IFU_L2UC_ERR |
		     ERR_CTL_IFU_IMQDP_ERR,
	 .errors = ifu_errors},

	{.name = "RET_JSR", .errx = 1,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE |
		     ERR_CTL_RET_JSR_TO_ERR | ERR_CTL_RET_JSR_GB_ERR |
		     ERR_CTL_RET_JSR_IRFP_ERR | ERR_CTL_RET_JSR_FRFP_ERR,
	 .errors = ret_jsr_errors},

	{.name = "MTS_JSR", .errx = 2,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_MTS_JSR_ERRUC_ERR | ERR_CTL_MTS_JSR_ERRC_ERR |
		     ERR_CTL_MTS_JSR_NAFLL_ERR | ERR_CTL_MTS_JSR_CARVE_ERR |
		     ERR_CTL_MTS_JSR_CRAB_ERR | ERR_CTL_MTS_JSR_MMIO_ERR,
	 .errors = mts_jsr_errors},

	{.name = "LSD_STQ", .errx = 3,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_LSD1_CCTLP_ERR | ERR_CTL_LSD1_CCTSP_ERR |
		     ERR_CTL_LSD1_CCMH_ERR | ERR_CTL_LSD1_CCDLECC_S_ERR |
		     ERR_CTL_LSD1_CCDLECC_D_ERR | ERR_CTL_LSD1_CCDSECC_S_ERR |
		     ERR_CTL_LSD1_CCDSECC_D_ERR | ERR_CTL_LSD1_CCDSMLECC_ERR,
	 .errors = lsd_1_errors},

	{.name = "LSD_DCC", .errx = 4,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_LSD2_BTCCVPP_ERR | ERR_CTL_LSD2_BTCCPPP_ERR |
		     ERR_CTL_LSD2_VRCDECC_S_ERR | ERR_CTL_LSD2_VRCDECC_D_ERR |
		     ERR_CTL_LSD2_BTMCMH_ERR | ERR_CTL_LSD2_VRCBP_ERR |
		     ERR_CTL_LSD2_CCDEECC_S_ERR | ERR_CTL_LSD2_CCDEECC_D_ERR,
	 .errors = lsd_2_errors},

	{.name = "LSD_L1HPF", .errx = 5,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_LSD3_L2TLBP_ERR,
	 .errors = lsd_3_errors},

	{}
};
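/*
 * Each err_ctrl mask above combines the architectural ERR<n>CTLR enables
 * (RAS_CTL_ED for error detection, with RAS_CTL_UE and RAS_CTL_CFI
 * presumably enabling uncorrectable-error reporting and the corrected
 * fault-handling interrupt, following the ARMv8.2 RAS ERR<n>CTLR layout)
 * with Carmel-specific per-error enable bits, so writing err_ctrl to the
 * selected record via ras_write_error_control() turns on exactly the
 * errors named in the matching ras_error table.  The actual bit positions
 * live in the RAS header, not in this file.
 */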
/* Error Records per CORE CLUSTER - L2 errors
 * error_code = value of ARM_ERR_STATUS:IERR[15:8]
 */
static struct ras_error l2_errors[] = {
	{.name = "URT Timeout", .error_code = 0x68},
	{.name = "L2 Protocol Violation", .error_code = 0x67},
	{.name = "SCF to L2 Slave Error Read", .error_code = 0x66},
	{.name = "SCF to L2 Slave Error Write", .error_code = 0x65},
	{.name = "SCF to L2 Decode Error Read", .error_code = 0x64},
	{.name = "SCF to L2 Decode Error Write", .error_code = 0x63},
	{.name = "SCF to L2 Request Response Interface Parity Errors",
	 .error_code = 0x62},
	{.name = "SCF to L2 Advance notice interface parity errors",
	 .error_code = 0x61},
	{.name = "SCF to L2 Filldata Parity Errors", .error_code = 0x60},
	{.name = "SCF to L2 UnCorrectable ECC Data Error on interface",
	 .error_code = 0x5F},
	{.name = "SCF to L2 Correctable ECC Data Error on interface",
	 .error_code = 0x5E},
	{.name = "Core 1 to L2 Parity Error", .error_code = 0x5D},
	{.name = "Core 0 to L2 Parity Error", .error_code = 0x5C},
	{.name = "L2 Multi-Hit", .error_code = 0x5B},
	{.name = "L2 URT Tag Parity Error", .error_code = 0x5A},
	{.name = "L2 NTT Tag Parity Error", .error_code = 0x59},
	{.name = "L2 MLT Tag Parity Error", .error_code = 0x58},
	{.name = "L2 URD Data", .error_code = 0x57},
	{.name = "L2 NTP Data", .error_code = 0x56},
	{.name = "L2 MLC Uncorrectable Clean", .error_code = 0x54},
	{.name = "L2 URD Uncorrectable", .error_code = 0x53},
	{.name = "L2 MLC Uncorrectable Dirty", .error_code = 0x52},
	{.name = "L2 URD Correctable Error", .error_code = 0x51},
	{.name = "L2 MLC Correctable Error", .error_code = 0x50},
	{}
};

/* Error Records per CORE CLUSTER - MMU errors */
static struct ras_error mmu_errors[] = {
	{.name = "Walker Cache Parity Error", .error_code = 0xE9},
	{.name = "A$ Parity Error", .error_code = 0xE8},
	{}
};

/* Error Records per CORE CLUSTER - Cluster Clocks errors */
static struct ras_error cluster_clocks_errors[] = {
	{.name = "Frequency Monitor Error", .error_code = 0xE4},
	{}
};

/* Error Records per CORE CLUSTER */
static struct error_record corecluster_ers[] = {
	{.name = "L2", .errx = 0,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_L2_MLD_ECCC_ERR | ERR_CTL_L2_URD_ECCC_ERR |
		     ERR_CTL_L2_MLD_ECCUD_ERR | ERR_CTL_L2_URD_ECCU_ERR |
		     ERR_CTL_L2_MLD_ECCUC_ERR | ERR_CTL_L2_NTDP_ERR |
		     ERR_CTL_L2_URDP | ERR_CTL_L2_MLTP_ERR |
		     ERR_CTL_L2_NTTP_ERR | ERR_CTL_L2_URTP_ERR |
		     ERR_CTL_L2_L2MH_ERR | ERR_CTL_L2_CORE02L2CP_ERR |
		     ERR_CTL_L2_CORE12L2CP_ERR | ERR_CTL_L2_SCF2L2C_ECCC_ERR |
		     ERR_CTL_L2_SCF2L2C_ECCU_ERR |
		     ERR_CTL_L2_SCF2L2C_FILLDATAP_ERR |
		     ERR_CTL_L2_SCF2L2C_ADVNOTP_ERR |
		     ERR_CTL_L2_SCF2L2C_REQRSPP_ERR |
		     ERR_CTL_L2_SCF2L2C_DECWTERR_ERR |
		     ERR_CTL_L2_SCF2L2C_DECRDERR_ERR |
		     ERR_CTL_L2_SCF2L2C_SLVWTERR_ERR |
		     ERR_CTL_L2_SCF2L2C_SLVRDERR_ERR |
		     ERR_CTL_L2_L2PCL_ERR | ERR_CTL_L2_URTTO_ERR,
	 .errors = l2_errors},

	{.name = "CLUSTER_CLOCKS", .errx = 1,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | ERR_CTL_CC_FREQ_MON_ERR,
	 .errors = cluster_clocks_errors},

	{.name = "MMU", .errx = 2,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_CFI |
		     ERR_CTL_MMU_ACPERR_ERR | ERR_CTL_MMU_WCPERR_ERR,
	 .errors = mmu_errors},

	{}
};

/* Error Records per CCPLEX - CMU:CCPMU errors
 * error_code = value of ARM_ERR_STATUS:IERR[15:8]
 */
static struct ras_error cmu_ccpmu_errors[] = {
	{.name = "MCE Ucode Error", .error_code = 0x84},
	{.name = "MCE IL1 Parity Error", .error_code = 0x83},
	{.name = "MCE Timeout Error", .error_code = 0x82},
	{.name = "CRAB Access Error", .error_code = 0x81},
	{.name = "MCE Memory Access Error", .error_code = 0x80},
	{}
};

/* Error Records per CCPLEX - SCF:IOB errors */
static struct ras_error scf_iob_errors[] = {
	{.name = "Request parity error", .error_code = 0x99},
	{.name = "Putdata parity error", .error_code = 0x98},
	{.name = "Uncorrectable ECC on Putdata", .error_code = 0x97},
	{.name = "CBB Interface Error", .error_code = 0x96},
	{.name = "MMCRAB Error", .error_code = 0x95},
	{.name = "IHI Interface Error", .error_code = 0x94},
	{.name = "CRI Error", .error_code = 0x93},
	{.name = "TBX Interface Error", .error_code = 0x92},
	{.name = "EVP Interface Error", .error_code = 0x91},
	{.name = "Correctable ECC on Putdata", .error_code = 0x90},
	{}
};

/* Error Records per CCPLEX - SCF:SNOC errors */
static struct ras_error scf_snoc_errors[] = {
	{.name = "Misc Client Parity Error", .error_code = 0xAA},
	{.name = "Misc Filldata Parity Error", .error_code = 0xA9},
	{.name = "Uncorrectable ECC Misc Client", .error_code = 0xA8},
	{.name = "DVMU Interface Parity Error", .error_code = 0xA7},
	{.name = "DVMU Interface Timeout Error", .error_code = 0xA6},
	{.name = "CPE Request Error", .error_code = 0xA5},
	{.name = "CPE Response Error", .error_code = 0xA4},
	{.name = "CPE Timeout Error", .error_code = 0xA3},
	{.name = "Uncorrectable Carveout Error", .error_code = 0xA2},
	{.name = "Correctable ECC Misc Client", .error_code = 0xA1},
	{.name = "Correctable Carveout Error", .error_code = 0xA0},
	{}
};

/* Error Records per CCPLEX - SCF:CTU errors */
static struct ras_error cmu_ctu_errors[] = {
	{.name = "Timeout error for TRC_DMA request", .error_code = 0xB7},
	{.name = "Timeout error for CTU Snp", .error_code = 0xB6},
	{.name = "Parity error in CTU TAG RAM", .error_code = 0xB5},
	{.name = "Parity error in CTU DATA RAM", .error_code = 0xB3},
	{.name = "Parity error for Cluster Rsp", .error_code = 0xB4},
	{.name = "Parity error for TRL requests from 9 agents", .error_code = 0xB2},
	{.name = "Parity error for MCF request", .error_code = 0xB1},
	{.name = "TRC DMA fillsnoop parity error", .error_code = 0xB0},
	{}
};

/* Error Records per CCPLEX - SCF:L3_* errors */
static struct ras_error scf_l3_errors[] = {
	{.name = "L3 Correctable ECC Error", .error_code = 0x7C},
	{.name = "SNOC Interface Parity Error", .error_code = 0x7B},
	{.name = "MCF Interface Parity Error", .error_code = 0x7A},
	{.name = "L3 Tag Parity Error", .error_code = 0x79},
	{.name = "L3 Dir Parity Error", .error_code = 0x78},
	{.name = "L3 Uncorrectable ECC Error", .error_code = 0x77},
	{.name = "Multi-Hit CAM Error", .error_code = 0x75},
	{.name = "Multi-Hit Tag Error", .error_code = 0x74},
	{.name = "Unrecognized Command Error", .error_code = 0x73},
	{.name = "L3 Protocol Error", .error_code = 0x72},
	{}
};

/* Error Records per CCPLEX - CMU_Clocks errors */
static struct ras_error scfcmu_clocks_errors[] = {
	{.name = "Cluster 3 frequency monitor error", .error_code = 0xC7},
	{.name = "Cluster 2 frequency monitor error", .error_code = 0xC6},
	{.name = "Cluster 1 frequency monitor error", .error_code = 0xC5},
	{.name = "Cluster 0 frequency monitor error", .error_code = 0xC3},
	{.name = "Voltage error on ADC1 Monitored Logic", .error_code = 0xC4},
	{.name = "Voltage error on ADC0 Monitored Logic", .error_code = 0xC2},
	{.name = "Lookup Table 1 Parity Error", .error_code = 0xC1},
	{.name = "Lookup Table 0 Parity Error", .error_code = 0xC0},
	{}
};

/* Error Records per CCPLEX */
static struct error_record ccplex_ers[] = {
	{.name = "CMU:CCPMU", .errx = 1024,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE |
		     ERR_CTL_DPMU_DMCE_CRAB_ACC_ERR |
		     ERR_CTL_DPMU_CRAB_ACC_ERR |
		     ERR_CTL_DPMU_DMCE_IL1_PAR_ERR |
		     ERR_CTL_DPMU_DMCE_TIMEOUT_ERR |
		     ERR_CTL_DPMU_DMCE_UCODE_ERR,
	 .errors = cmu_ccpmu_errors},

	{.name = "SCF:IOB", .errx = 1025,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_SCFIOB_REQ_PAR_ERR | ERR_CTL_SCFIOB_PUT_PAR_ERR |
		     ERR_CTL_SCFIOB_PUT_CECC_ERR | ERR_CTL_SCFIOB_PUT_UECC_ERR |
		     ERR_CTL_SCFIOB_EVP_ERR | ERR_CTL_SCFIOB_TBX_ERR |
		     ERR_CTL_SCFIOB_CRI_ERR | ERR_CTL_SCFIOB_MMCRAB_ERR |
		     ERR_CTL_SCFIOB_IHI_ERR | ERR_CTL_SCFIOB_CBB_ERR,
	 .errors = scf_iob_errors},

	{.name = "SCF:SNOC", .errx = 1026,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_SCFSNOC_CPE_TO_ERR | ERR_CTL_SCFSNOC_CPE_RSP_ERR |
		     ERR_CTL_SCFSNOC_CPE_REQ_ERR | ERR_CTL_SCFSNOC_DVMU_TO_ERR |
		     ERR_CTL_SCFSNOC_DVMU_PAR_ERR |
		     ERR_CTL_SCFSNOC_MISC_CECC_ERR |
		     ERR_CTL_SCFSNOC_MISC_UECC_ERR |
		     ERR_CTL_SCFSNOC_MISC_PAR_ERR |
		     ERR_CTL_SCFSNOC_MISC_RSP_ERR |
		     ERR_CTL_SCFSNOC_CARVEOUT_ERR |
		     ERR_CTL_SCFSNOC_CARVEOUT_CECC_ERR,
	 .errors = scf_snoc_errors},

	{.name = "CMU:CTU", .errx = 1027,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE |
		     ERR_CTL_CMUCTU_TRCDMA_PAR_ERR | ERR_CTL_CMUCTU_MCF_PAR_ERR |
		     ERR_CTL_CMUCTU_TRL_PAR_ERR |
		     ERR_CTL_CMUCTU_CTU_DATA_PAR_ERR |
		     ERR_CTL_CMUCTU_TAG_PAR_ERR | ERR_CTL_CMUCTU_CTU_SNP_ERR |
		     ERR_CTL_CMUCTU_TRCDMA_REQ_ERR | ERR_CTL_CMUCTU_RSP_PAR_ERR,
	 .errors = cmu_ctu_errors},

	{.name = "SCF:L3_0", .errx = 768,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_SCFL3_CECC_ERR | ERR_CTL_SCFL3_SNOC_INTFC_ERR |
		     ERR_CTL_SCFL3_MCF_INTFC_ERR | ERR_CTL_SCFL3_TAG_ERR |
		     ERR_CTL_SCFL3_L2DIR_ERR | ERR_CTL_SCFL3_UECC_ERR |
		     ERR_CTL_SCFL3_MH_CAM_ERR | ERR_CTL_SCFL3_MH_TAG_ERR |
		     ERR_CTL_SCFL3_UNSUPP_REQ_ERR | ERR_CTL_SCFL3_PROT_ERR,
	 .errors = scf_l3_errors},

	{.name = "SCF:L3_1", .errx = 769,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_SCFL3_CECC_ERR | ERR_CTL_SCFL3_SNOC_INTFC_ERR |
		     ERR_CTL_SCFL3_MCF_INTFC_ERR | ERR_CTL_SCFL3_TAG_ERR |
		     ERR_CTL_SCFL3_L2DIR_ERR | ERR_CTL_SCFL3_UECC_ERR |
		     ERR_CTL_SCFL3_MH_CAM_ERR | ERR_CTL_SCFL3_MH_TAG_ERR |
		     ERR_CTL_SCFL3_UNSUPP_REQ_ERR | ERR_CTL_SCFL3_PROT_ERR,
	 .errors = scf_l3_errors},

	{.name = "SCF:L3_2", .errx = 770,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_SCFL3_CECC_ERR | ERR_CTL_SCFL3_SNOC_INTFC_ERR |
		     ERR_CTL_SCFL3_MCF_INTFC_ERR | ERR_CTL_SCFL3_TAG_ERR |
		     ERR_CTL_SCFL3_L2DIR_ERR | ERR_CTL_SCFL3_UECC_ERR |
		     ERR_CTL_SCFL3_MH_CAM_ERR | ERR_CTL_SCFL3_MH_TAG_ERR |
		     ERR_CTL_SCFL3_UNSUPP_REQ_ERR | ERR_CTL_SCFL3_PROT_ERR,
	 .errors = scf_l3_errors},

	{.name = "SCF:L3_3", .errx = 771,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE | RAS_CTL_CFI |
		     ERR_CTL_SCFL3_CECC_ERR | ERR_CTL_SCFL3_SNOC_INTFC_ERR |
		     ERR_CTL_SCFL3_MCF_INTFC_ERR | ERR_CTL_SCFL3_TAG_ERR |
		     ERR_CTL_SCFL3_L2DIR_ERR | ERR_CTL_SCFL3_UECC_ERR |
		     ERR_CTL_SCFL3_MH_CAM_ERR | ERR_CTL_SCFL3_MH_TAG_ERR |
		     ERR_CTL_SCFL3_UNSUPP_REQ_ERR | ERR_CTL_SCFL3_PROT_ERR,
	 .errors = scf_l3_errors},

	{.name = "SCFCMU_CLOCKS", .errx = 1028,
	 .err_ctrl = RAS_CTL_ED | RAS_CTL_UE |
		     ERR_CTL_SCFCMU_LUT0_PAR_ERR | ERR_CTL_SCFCMU_LUT1_PAR_ERR |
		     ERR_CTL_SCFCMU_ADC0_MON_ERR | ERR_CTL_SCFCMU_ADC1_MON_ERR |
		     ERR_CTL_SCFCMU_FREQ0_MON_ERR |
		     ERR_CTL_SCFCMU_FREQ1_MON_ERR |
		     ERR_CTL_SCFCMU_FREQ2_MON_ERR |
		     ERR_CTL_SCFCMU_FREQ3_MON_ERR,
	 .errors = scfcmu_clocks_errors},

	{}
};
static struct tegra_ras_impl_err_bit t194_ras_impl_err_bit[] = {
	{0xFF, ERR_CTL_IFU_ICDP_ERR},				/* IFU */
	{ERR_CTL_RET_JSR_GB_ERR, 0xFF},				/* JSR_RET */
	{ERR_CTL_MTS_JSR_CARVE_ERR, ERR_CTL_MTS_JSR_ERRC_ERR},	/* JSR_MTS */
	{ERR_CTL_LSD1_CCDSECC_D_ERR, ERR_CTL_LSD1_CCDSECC_S_ERR}, /* LSD_STQ */
	{ERR_CTL_LSD2_CCDEECC_D_ERR, ERR_CTL_LSD2_CCDEECC_S_ERR}, /* LSD_DCC */
	{0xFF, ERR_CTL_LSD3_L2TLBP_ERR},			/* LSD_L1HPF */
	{ERR_CTL_L2_L2PCL_ERR, ERR_CTL_L2_SCF2L2C_ECCC_ERR},	/* L2 */
	{ERR_CTL_CC_FREQ_MON_ERR, 0xFF},			/* Cluster_Clocks */
	{0xFF, ERR_CTL_MMU_WCPERR_ERR},				/* MMU */
	{ERR_CTL_SCFL3_PROT_ERR, ERR_CTL_SCFL3_CECC_ERR},	/* L3 */
	{ERR_CTL_DPMU_DMCE_CRAB_ACC_ERR, 0xFF},			/* CCPMU */
	{ERR_CTL_SCFIOB_CBB_ERR, ERR_CTL_SCFIOB_PUT_CECC_ERR},	/* SCF_IOB */
	{ERR_CTL_SCFSNOC_CPE_TO_ERR, ERR_CTL_SCFSNOC_MISC_CECC_ERR}, /* SCFSNOC */
	{ERR_CTL_CMUCTU_MCF_PAR_ERR, 0xFF},			/* SCF_CTU */
	{ERR_CTL_SCFCMU_FREQ0_MON_ERR, 0xFF}			/* CMU_Clocks */
};

/* This is called for each online CPU during probe and is also used
 * as hotplug callback to enable RAS every time a core comes online
 */
static void carmel_ras_enable(void *info)
{
	u64 errx;
	int i;
	u8 cpu = smp_processor_id();

	/* Enable Core Error Records */
	for (i = 0; core_ers[i].name; i++) {
		errx = (tegra18_logical_to_cluster(cpu) << 5) +
		       (tegra18_logical_to_cpu(cpu) << 4) +
		       core_ers[i].errx;
		ras_write_errselr(errx);
		ras_write_error_control(core_ers[i].err_ctrl);
		ras_read_error_control();
	}

	/* Enable Core Cluster Error Records */
	for (i = 0; corecluster_ers[i].name; i++) {
		errx = 512 + (tegra18_logical_to_cluster(cpu) << 4) +
		       corecluster_ers[i].errx;
		ras_write_errselr(errx);
		ras_write_error_control(corecluster_ers[i].err_ctrl);
		ras_read_error_control();
	}

	/* Enable CCPLEX Error Records */
	for (i = 0; ccplex_ers[i].name; i++) {
		ras_write_errselr(ccplex_ers[i].errx);
		ras_write_error_control(ccplex_ers[i].err_ctrl);
		ras_read_error_control();
	}

	pr_info("%s: RAS enabled on cpu%d\n", __func__, cpu);
}

static int carmel_ras_enable_callback(unsigned int cpu)
{
	if (is_this_ras_cpu())
		smp_call_function_single(cpu, carmel_ras_enable, NULL, 1);

	return 0;
}
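/*
 * Worked example of the record numbering used by carmel_ras_enable() and
 * by the scan loops below (derived from the arithmetic in this file, not
 * from a datasheet).  For a CPU that tegra18_logical_to_cluster() and
 * tegra18_logical_to_cpu() map to cluster 1, core 1, the per-core records
 * sit at
 *
 *	errx = (cluster << 5) + (core << 4) + record->errx
 *	     = (1 << 5) + (1 << 4) + 0..5 = 48..53
 *
 * and that cluster's records at
 *
 *	errx = 512 + (cluster << 4) + record->errx = 528..530
 *
 * The CCPLEX-wide records use the fixed indices from ccplex_ers[] above:
 * 768-771 (3*256 + L3 bank) and 1024-1028 (4*256 + unit), matching the
 * values computed in ras_mca_get_record_errselr() further down.  Each errx
 * is written to ERRSELR_EL1 via ras_write_errselr() so that the following
 * ERR<n>CTLR/ERR<n>STATUS accesses target that record.
 */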
/* SERROR is triggered for Uncorrectable errors.
 * This is SERR Callback for error records per core.
 * A core will scan all other cores' per-core error records.
 */
static int ras_core_serr_callback(struct pt_regs *regs, int reason,
				  unsigned int esr, void *priv)
{
	u64 err_status;
	int cpu, errx;
	unsigned long flags;
	int retval = 1;
	struct error_record *record;

	if (!is_this_ras_cpu())
		return retval;

	pr_info("%s: Scanning Core Error Records for Uncorrectable Errors\n",
		__func__);
	raw_spin_lock_irqsave(&core_ras_lock, flags);

	/* scan all CPUs' per-core error records */
	for_each_online_cpu(cpu) {
		if (!tegra_is_cpu_carmel(cpu))
			continue;

		list_for_each_entry(record, &core_ras_list, node) {
			errx = (tegra18_logical_to_cluster(cpu) << 5) +
			       (tegra18_logical_to_cpu(cpu) << 4) +
			       record->errx;
			ras_write_errselr(errx);
			err_status = ras_read_error_status();
			if ((err_status & ERRi_STATUS_UE) &&
			    (err_status & ERRi_STATUS_VALID)) {
				print_error_record(record, err_status, errx);
				retval = 0;
			}
		}
	}

	raw_spin_unlock_irqrestore(&core_ras_lock, flags);
	return retval;
}

static struct serr_hook core_serr_callback = {
	.fn = ras_core_serr_callback
};

static void register_core_er(struct error_record *record)
{
	list_add(&record->node, &core_ras_list);
}

static void unregister_core_er(struct error_record *record)
{
	list_del(&record->node);
}

static void ras_register_core_ers(void)
{
	int i;

	for (i = 0; core_ers[i].name; i++)
		register_core_er(&core_ers[i]);
}

static void ras_unregister_core_ers(void)
{
	int i;

	for (i = 0; core_ers[i].name; i++)
		unregister_core_er(&core_ers[i]);
}

/*
 * This is used to handle FHI or Correctable Errors triggered from
 * error records per core.
 */
static void handle_fhi_core(void)
{
	u64 err_status;
	int cpu, errx;
	struct error_record *record;

	pr_info("%s: Scanning Core Error Records for Correctable Errors\n",
		__func__);

	/* scan all CPUs' per-core error records */
	for_each_online_cpu(cpu) {
		if (!tegra_is_cpu_carmel(cpu))
			continue;

		list_for_each_entry(record, &core_ras_list, node) {
			errx = (tegra18_logical_to_cluster(cpu) << 5) +
			       (tegra18_logical_to_cpu(cpu) << 4) +
			       record->errx;
			ras_write_errselr(errx);
			err_status = ras_read_error_status();
			if (get_error_status_ce(err_status) &&
			    (err_status & ERRi_STATUS_VALID))
				print_error_record(record, err_status, errx);
		}
	}
}
/* SERROR is triggered for Uncorrectable errors.
 * This is SERR Callback for error records per Core Cluster.
 */
static int ras_corecluster_serr_callback(struct pt_regs *regs, int reason,
					 unsigned int esr, void *priv)
{
	u64 err_status;
	int cpu, errx;
	unsigned long flags;
	int retval = 1;
	struct error_record *record;

	if (!is_this_ras_cpu())
		return retval;

	pr_info("%s: Scanning CoreCluster Error Records for Uncorrectable Errors\n",
		__func__);
	raw_spin_lock_irqsave(&corecluster_ras_lock, flags);

	/* scan all CPUs' per-cluster error records */
	for_each_online_cpu(cpu) {
		if (!tegra_is_cpu_carmel(cpu))
			continue;

		list_for_each_entry(record, &corecluster_ras_list, node) {
			errx = 512 + (tegra18_logical_to_cluster(cpu) << 4) +
			       record->errx;
			ras_write_errselr(errx);
			err_status = ras_read_error_status();
			if ((err_status & ERRi_STATUS_UE) &&
			    (err_status & ERRi_STATUS_VALID)) {
				print_error_record(record, err_status, errx);
				retval = 0;
			}
		}
	}

	raw_spin_unlock_irqrestore(&corecluster_ras_lock, flags);
	return retval;
}

static struct serr_hook corecluster_serr_callback = {
	.fn = ras_corecluster_serr_callback
};

static void register_corecluster_er(struct error_record *record)
{
	list_add(&record->node, &corecluster_ras_list);
}

static void unregister_corecluster_er(struct error_record *record)
{
	list_del(&record->node);
}

static void ras_register_corecluster_ers(void)
{
	int i;

	for (i = 0; corecluster_ers[i].name; i++)
		register_corecluster_er(&corecluster_ers[i]);
}

static void ras_unregister_corecluster_ers(void)
{
	int i;

	for (i = 0; corecluster_ers[i].name; i++)
		unregister_corecluster_er(&corecluster_ers[i]);
}

/* This is used to handle FHI or Correctable Errors
 * triggered from error records per Core Cluster
 */
static void handle_fhi_corecluster(void)
{
	u64 err_status;
	int cpu, errx;
	struct error_record *record;

	pr_info("%s: Scanning CoreCluster Error Records for Correctable Errors\n",
		__func__);

	for_each_online_cpu(cpu) {
		if (!tegra_is_cpu_carmel(cpu))
			continue;

		list_for_each_entry(record, &corecluster_ras_list, node) {
			errx = 512 + (tegra18_logical_to_cluster(cpu) << 4) +
			       record->errx;
			ras_write_errselr(errx);
			err_status = ras_read_error_status();
			if (get_error_status_ce(err_status) &&
			    (err_status & ERRi_STATUS_VALID))
				print_error_record(record, err_status, errx);
		}
	}
}
/* SERROR is triggered for Uncorrectable errors.
 * This is SERR Callback for error records per CCPLEX.
 */
static int ras_ccplex_serr_callback(struct pt_regs *regs, int reason,
				    unsigned int esr, void *priv)
{
	u64 err_status;
	unsigned long flags;
	int retval = 1;
	struct error_record *record;

	/* Return if this CPU doesn't support RAS */
	if (!is_this_ras_cpu())
		return retval;

	pr_info("%s: Scanning CCPLEX Error Records for Uncorrectable Errors\n",
		__func__);
	raw_spin_lock_irqsave(&ccplex_ras_lock, flags);

	list_for_each_entry(record, &ccplex_ras_list, node) {
		ras_write_errselr(record->errx);
		err_status = ras_read_error_status();
		if ((err_status & ERRi_STATUS_UE) &&
		    (err_status & ERRi_STATUS_VALID)) {
			print_error_record(record, err_status, record->errx);
			retval = 0;
		}
	}

	raw_spin_unlock_irqrestore(&ccplex_ras_lock, flags);
	return is_debug ? 1 : retval;
}

static struct serr_hook ccplex_serr_callback = {
	.fn = ras_ccplex_serr_callback
};

static void register_ccplex_er(struct error_record *record)
{
	list_add(&record->node, &ccplex_ras_list);
}

static void unregister_ccplex_er(struct error_record *record)
{
	list_del(&record->node);
}

static void ras_register_ccplex_ers(void)
{
	int i;

	for (i = 0; ccplex_ers[i].name; i++)
		register_ccplex_er(&ccplex_ers[i]);
}

static void ras_unregister_ccplex_ers(void)
{
	int i;

	for (i = 0; ccplex_ers[i].name; i++)
		unregister_ccplex_er(&ccplex_ers[i]);
}

/* This is used to handle FHI or Correctable Errors
 * triggered from error records per CCPLEX.
 */
static void handle_fhi_ccplex(void)
{
	u64 err_status;
	struct error_record *record;

	/* Return if RAS is not supported on this CPU */
	if (!is_this_ras_cpu())
		return;

	pr_info("%s: Scanning CCPLEX Error Records for Correctable Errors\n",
		__func__);

	list_for_each_entry(record, &ccplex_ras_list, node) {
		ras_write_errselr(record->errx);
		err_status = ras_read_error_status();
		if (get_error_status_ce(err_status) &&
		    (err_status & ERRi_STATUS_VALID))
			print_error_record(record, err_status, record->errx);
	}
}
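/*
 * To summarise the two reporting paths implemented above: uncorrectable
 * errors surface as SErrors and are handled by the ras_*_serr_callback()
 * hooks, which scan their record lists for entries with ERRi_STATUS_UE
 * set; correctable errors surface as fault-handling interrupts (FHI) and
 * are handled by the handle_fhi_*() helpers, which test
 * get_error_status_ce() instead.  Both paths select records with
 * ras_write_errselr() and report hits via print_error_record().
 */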
/* FHI is triggered for Correctable errors.
 * This is FHI Callback for handling error records per core,
 * per core cluster and per CCPLEX
 */
static void carmel_fhi_callback(void)
{
	handle_fhi_core();
	handle_fhi_corecluster();
	handle_fhi_ccplex();
}

static struct ras_fhi_callback fhi_callback = {
	.fn = carmel_fhi_callback
};

/* This function is used to trigger RAS Errors
 * depending upon the error record and error enabled
 * in the pfgctl passed to it
 */
static int ras_trip(u64 errx, u64 pfgctl)
{
	unsigned long flags, err_ctl;

	flags = arch_local_save_flags();

	/* Print some debug information */
	pr_crit("%s: DAIF = 0x%lx\n", __func__, flags);
	if (flags & 0x4) {
		pr_crit("%s: \"A\" not set", __func__);
		return 0;
	}

	ras_write_errselr(errx);
	pr_info("%s: Error Record Selected = %lld\n", __func__,
		ras_read_errselr());

	err_ctl = ras_read_error_control();
	pr_crit("%s: Error Record ERRCTL = 0x%lx\n", __func__, err_ctl);
	if (!(err_ctl & RAS_CTL_ED)) {
		pr_crit("%s: Error Detection is not enabled", __func__);
		return 0;
	}

	/* Write some value to MISC0 */
	ras_write_error_misc0(ERRi_MISC0_CONST);
	/* Write some value to MISC1 */
	ras_write_error_misc1(ERRi_MISC1_CONST);
	/* Write some value to ADDR */
	ras_write_error_addr(ERRi_ADDR_CONST);
	is_debug = 1;

	/* Set countdown value */
	ras_write_pfg_cdn(ERRi_PFGCDN_CDN_1);

	/* Write to ERRPFGCTL */
	pr_info("%s: Writing 0x%llx to ERRXPFGCTL\n", __func__, pfgctl);
	ras_write_pfg_control(pfgctl);

	return 0;
}

static int l3_cecc_put(void *data, u64 val)
{
	return ras_trip(ERRX_SCFL3, val);
}

/* This will return the special value to be written to debugfs node
 * L3_0_CECC_ERR-trip to trigger L3_0_CECC Error
 * Value is written to PFGCTL register.
 * Enables bits CECC_ERR|CDNEN|MV|AV|CE|UC
 */
static int l3_cecc_get(void *data, u64 *val)
{
	*val = ERRi_PFGCTL_UC | ERRi_PFGCTL_CE | ERRi_PFGCTL_CDNEN |
	       ERR_CTL_SCFL3_CECC_ERR;
	return 0;
}

static int scf_iob_cecc_put(void *data, u64 val)
{
	return ras_trip(ERRX_SCFIOB, val);
}

/* This will return the special value to be written to debugfs node
 * SCF_IOB-PUTDATA_CECC_ERR-trip to trigger SCF IOB PUTDATA_CECC Error
 */
static int scf_iob_cecc_get(void *data, u64 *val)
{
	*val = ERRi_PFGCTL_UC | ERRi_PFGCTL_CE | ERRi_PFGCTL_CDNEN |
	       ERR_CTL_SCFIOB_PUT_CECC_ERR;
	return 0;
}

static int scf_iob_cbb_put(void *data, u64 val)
{
	return ras_trip(ERRX_SCFIOB, val);
}

/* This will return the special value to be written to debugfs node
 * SCF_IOB-CBB_ERR-trip to trigger SCF IOB CBB Error
 */
static int scf_iob_cbb_get(void *data, u64 *val)
{
	*val = ERRi_PFGCTL_UC | ERRi_PFGCTL_CE | ERRi_PFGCTL_CDNEN |
	       ERR_CTL_SCFIOB_CBB_ERR;
	return 0;
}
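/*
 * Usage sketch for the fixed trip nodes created in ras_carmel_dbgfs_init()
 * below (a debugging aid; the /sys/kernel/debug mount point is an
 * assumption about the running system): reading a node returns the
 * ERRXPFGCTL value suggested by its *_get() helper, and writing that value
 * back injects the corresponding error through ras_trip(), e.g.
 *
 *	cd /sys/kernel/debug/carmel_ras
 *	cat L3_0_CECC_ERR-trip          # suggested PFGCTL value
 *	echo <that value> > L3_0_CECC_ERR-trip
 */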
/*
 * Parse fields from input to use further for injecting RAS error.
 * These fields are used to get error record number which will be
 * used to select specific error record using ERRSELR_EL1 for
 * injecting error.
 * i/p field "val" format is "EEDDCCBBAA", where:
 * AA[00-07] - Unit
 * BB[08-15] - Error type(Corr is 0, UnCorr is 1)
 * CC[16-23] - Logical_CPU_ID
 * DD[24-31] - Logical_Cluster_ID
 * EE[32-39] - L3_Bank_ID
 */
static int ras_mca_get_record_errselr(u64 val, u64 *err_inject)
{
	int unit = RAS_EXTRACT(val, 7, 0);
	int uncorr_err = RAS_EXTRACT(val, 15, 8);
	int Logical_CPU_ID = RAS_EXTRACT(val, 23, 16);
	int Logical_Cluster_ID = RAS_EXTRACT(val, 31, 24);
	int L3_Bank_ID = RAS_EXTRACT(val, 39, 32);

	*err_inject = ERRi_PFGCTL_UC | ERRi_PFGCTL_CE | ERRi_PFGCTL_CDNEN;

	pr_info("Unit:0x%x Err_type:%s Logical_CPUID:0x%x Logical_ClusterID:"
		"0x%x L3_BankID:0x%x\n", unit, uncorr_err ? "UnCorr" : "Corr",
		Logical_CPU_ID, Logical_Cluster_ID, L3_Bank_ID);

	if (uncorr_err)
		*err_inject |= t194_ras_impl_err_bit[unit].uncorr_bit;
	else
		*err_inject |= t194_ras_impl_err_bit[unit].corr_bit;

	switch (unit) {
	case IFU:
		return 0*256 + Logical_CPU_ID*16 + 0;
	case JSR_RET:
		return 0*256 + Logical_CPU_ID*16 + 1;
	case JSR_MTS:
		return 0*256 + Logical_CPU_ID*16 + 2;
	case LSD_STQ:
		return 0*256 + Logical_CPU_ID*16 + 3;
	case LSD_DCC:
		return 0*256 + Logical_CPU_ID*16 + 4;
	case LSD_L1HPF:
		return 0*256 + Logical_CPU_ID*16 + 5;
	case L2:
		return 2*256 + Logical_Cluster_ID*16 + 0;
	case Cluster_Clocks:
		return 2*256 + Logical_Cluster_ID*16 + 1;
	case MMU:
		return 2*256 + Logical_Cluster_ID*16 + 2;
	case L3:
		return 3*256 + L3_Bank_ID;
	case CCPMU:
		return 4*256 + 0;
	case SCF_IOB:
		return 4*256 + 1;
	case SCF_SNOC:
		return 4*256 + 2;
	case SCF_CTU:
		return 4*256 + 3;
	case CMU_Clocks:
		return 4*256 + 4;
	default:
		return 0xFF;
	}
}

/*
 * Print help for error injection and basic register info.
 */
static int ras_mca_get(void *data, u64 *val)
{
	unsigned long errctl = ras_read_error_control();

	*val = ras_read_pfg_control();
	pr_info("ERXPFGCTL_EL1:0x%llx ERRCTLR:0x%lx\n", *val, errctl);
	pr_info("Please write data in below format to this node for "
		"injecting RAS error.\n\techo EEDDCCBBAA > RAS_MCA_ERR-trip\n"
		"where:\n\t"
		" EE[32-39] - L3_Bank_ID\n\t"
		" DD[24-31] - Logical_Cluster_ID\n\t"
		" CC[16-23] - Logical_CPU_ID\n\t"
		" BB[08-15] - Error type(Corr is 0, UnCorr is 1)\n\t"
		" AA[00-07] - Unit\n\t"
		" Unit values are:\n\t\t"
		"IFU:00\n\t\tJSR_RET:01\n\t\tJSR_MTS:02\n\t\tLSD_STQ:03\n\t\t"
		"LSD_DCC:04\n\t\tLSD_L1HPF:05\n\t\tL2:06\n\t\t"
		"Cluster_Clocks:07\n\t\tMMU:08\n\t\tL3:09\n\t\tCCPMU:0A\n\t\t"
		"SCF_IOB:0B\n\t\tSCF_SNOC:0C\n\t\tSCF_CTU:0D\n\t\t"
		"CMU_Clocks:0E\n\n");
	return 0;
}
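/*
 * Worked example of the encoding parsed above (derived from the switch in
 * ras_mca_get_record_errselr() and the unit list printed by ras_mca_get(),
 * not from external documentation): to inject an uncorrectable L2 error on
 * cluster 1, the fields are AA = 06 (L2), BB = 01 (uncorrectable) and
 * DD = 01 (cluster 1), so
 *
 *	echo 0x0001000106 > RAS_MCA_ERR-trip
 *
 * selects record 2*256 + 1*16 + 0 = 528 and ORs the uncorr_bit of the L2
 * row of t194_ras_impl_err_bit[] into the PFGCTL value passed to
 * ras_trip().
 */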
/*
 * Read input(i/p) value and inject error based on value.
 */
static int ras_mca_put(void *data, u64 val)
{
	int err_record_no = 0;
	u64 err_inject = 0;

	err_record_no = ras_mca_get_record_errselr(val, &err_inject);
	pr_info("Errx(ERRSELR_EL1):0x%x ERXPFGCTL_EL1:0x%llx PFGCTL_bits:"
		"0x%llx\n", err_record_no, ras_read_pfg_control(), err_inject);

	if (err_inject == 0xFF || err_record_no == 0xFF)
		pr_info("Invalid input.\n");
	else
		return ras_trip(err_record_no, err_inject);

	return 0;
}

static int ras_mca_open(struct inode *inode, struct file *file)
{
	return simple_attr_open(inode, file, ras_mca_get, ras_mca_put,
				"0x%08lx");
}

static int scf_iob_cbb_open(struct inode *inode, struct file *file)
{
	return simple_attr_open(inode, file, scf_iob_cbb_get, scf_iob_cbb_put,
				"0x%08lx");
}

static int scf_iob_cecc_open(struct inode *inode, struct file *file)
{
	return simple_attr_open(inode, file, scf_iob_cecc_get,
				scf_iob_cecc_put, "0x%08lx");
}

static int l3_cecc_open(struct inode *inode, struct file *file)
{
	return simple_attr_open(inode, file, l3_cecc_get, l3_cecc_put,
				"0x%08lx");
}

static const struct file_operations fops_scf_iob_cbb = {
	.read = simple_attr_read,
	.write = simple_attr_write,
	.open = scf_iob_cbb_open,
	.llseek = noop_llseek,
};

static const struct file_operations fops_scf_iob_cecc = {
	.read = simple_attr_read,
	.write = simple_attr_write,
	.open = scf_iob_cecc_open,
	.llseek = noop_llseek,
};

static const struct file_operations fops_l3_cecc = {
	.read = simple_attr_read,
	.write = simple_attr_write,
	.open = l3_cecc_open,
	.llseek = noop_llseek,
};

static const struct file_operations fops_ras_mca = {
	.read = simple_attr_read,
	.write = simple_attr_write,
	.open = ras_mca_open,
	.llseek = noop_llseek,
};

static int ras_carmel_dbgfs_init(void)
{
	/* Install debugfs nodes to test RAS */
	debugfs_dir = debugfs_create_dir("carmel_ras", NULL);
	if (!debugfs_dir) {
		pr_err("Error creating carmel_ras debugfs dir.\n");
		return -ENODEV;
	}

	debugfs_node = debugfs_create_file("SCF_IOB-CBB_ERR-trip", 0600,
					   debugfs_dir, NULL,
					   &fops_scf_iob_cbb);
	if (!debugfs_node) {
		pr_err("Error creating SCF_IOB-CBB_ERR-trip debugfs node.\n");
		return -ENODEV;
	}

	debugfs_node = debugfs_create_file("SCF_IOB-PUTDATA_CECC_ERR-trip",
					   0600, debugfs_dir, NULL,
					   &fops_scf_iob_cecc);
	if (!debugfs_node) {
		pr_err("Error creating SCF_IOB-PUTDATA_CECC_ERR-trip debugfs node.\n");
		return -ENODEV;
	}

	debugfs_node = debugfs_create_file("L3_0_CECC_ERR-trip", 0600,
					   debugfs_dir, NULL, &fops_l3_cecc);
	if (!debugfs_node) {
		pr_err("Error creating L3_0_CECC_ERR-trip debugfs node.\n");
		return -ENODEV;
	}

	debugfs_node = debugfs_create_file("RAS_MCA_ERR-trip", 0600,
					   debugfs_dir, NULL, &fops_ras_mca);
	if (!debugfs_node) {
		pr_err("Error creating RAS_MCA_ERR-trip debugfs node.\n");
		return -ENODEV;
	}

	return 0;
}

static int ras_carmel_probe(struct platform_device *pdev)
{
	int cpu, do_init = 0, ret = -1;
	struct device *dev = &pdev->dev;

	if (!is_ras_ready()) {
		dev_info(dev, "Deferring probe, arm64_ras hasn't been probed yet");
		return -EPROBE_DEFER;
	}

	/* probe only if RAS is supported on any of the online CPUs */
	for_each_online_cpu(cpu) {
		if (tegra_is_cpu_carmel(cpu) && is_ras_cpu(cpu))
			do_init = 1;
	}

	if (!do_init) {
		dev_info(dev, "None of the CPUs support RAS");
		return 0;
	}

	ras_register_core_ers();
	ras_register_corecluster_ers();
	ras_register_ccplex_ers();

	/* register FHI callback for Correctable Errors */
	ret = register_fhi_callback(&fhi_callback, pdev);
	if (ret) {
		dev_err(dev, "Failed to register FHI callback\n");
		return -ENOENT;
	}

	/* Ensure that any CPU brought online sets up RAS */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ras_carmel:online",
				carmel_ras_enable_callback, NULL);
	if (ret < 0) {
		dev_err(dev, "unable to register cpu hotplug state\n");
		return ret;
	}
	hp_state = ret;

	/* register SERR for Uncorrectable Errors */
	register_serr_hook(&core_serr_callback);
	register_serr_hook(&corecluster_serr_callback);
	register_serr_hook(&ccplex_serr_callback);

	ret = ras_carmel_dbgfs_init();
	if (ret)
		return ret;

	dev_info(dev, "probed");
	return 0;
}

static int ras_carmel_remove(struct platform_device *pdev)
{
	unregister_fhi_callback(&fhi_callback);
	unregister_serr_hook(&core_serr_callback);
	unregister_serr_hook(&corecluster_serr_callback);
	unregister_serr_hook(&ccplex_serr_callback);
	cpuhp_remove_state(hp_state);
	ras_unregister_core_ers();
	ras_unregister_corecluster_ers();
	ras_unregister_ccplex_ers();

	return 0;
}

static const struct of_device_id ras_carmel_of_match[] = {
	{
		.name = "carmel_ras",
		.compatible = "nvidia,carmel-ras",
	},
	{},
};
MODULE_DEVICE_TABLE(of, ras_carmel_of_match);

static struct platform_driver ras_carmel_driver = {
	.probe = ras_carmel_probe,
	.remove = ras_carmel_remove,
	.driver = {
		.owner = THIS_MODULE,
		.name = "carmel_ras",
		.of_match_table = of_match_ptr(ras_carmel_of_match),
	},
};

static int __init ras_carmel_init(void)
{
	return platform_driver_register(&ras_carmel_driver);
}

static void __exit ras_carmel_exit(void)
{
	platform_driver_unregister(&ras_carmel_driver);
}

arch_initcall(ras_carmel_init);
module_exit(ras_carmel_exit);

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Carmel RAS handler");