/* * RAS driver for T194 * * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. */ #include #include #include #include #include #include static int fhi_irq[CONFIG_NR_CPUS]; static u8 is_ras_probe_done; static LIST_HEAD(fhi_callback_list); static DEFINE_RAW_SPINLOCK(fhi_lock); /* saved hotplug state */ static enum cpuhp_state hp_state; static const struct of_device_id arm64_ras_match[] = { { .name = "arm64_ras", .compatible = "arm,armv8.2-ras", }, { }, }; MODULE_DEVICE_TABLE(of, arm64_ras_match); /* Architecturally defined primary error code * Used for decoding ERRSTATUS:SERR bits */ static struct ras_error serr_errors[] = { {.name = "No error", .error_code = 0x0}, {.name = "Implementation defined error", .error_code = 0x1}, {.name = "Data value from non-associative internal memory", .error_code = 0x2}, {.name = "Implementation defined pin", .error_code = 0x3}, {.name = "Assertion Failure", .error_code = 0x4}, {.name = "Internal Data Path", .error_code = 0x5}, {.name = "Data value from associative memory", .error_code = 0x6}, {.name = "Address/control value(s) from associative memory", .error_code = 0x7}, {.name = "Data value from a TLB", .error_code = 0x8}, {.name = "Address/control value(s) from a TLB", .error_code = 0x9}, {.name = "Data value from producer", .error_code = 0xA}, {.name = "Address/control value(s) from producer", .error_code = 0xB}, {.name = "Data value from (non-associative) external memory", .error_code = 0xC}, {.name = "Illegal address (software fault)", .error_code = 0xD}, {.name = "Illegal access (software fault)", .error_code = 0xE}, {.name = "Illegal state (software fault)", .error_code = 0xF}, {.name = "Internal data register", .error_code = 0x10}, {.name = "Internal control register", .error_code = 0x11}, {.name = "Error response from slave", .error_code = 0x12}, {.name = "External timeout", .error_code = 0x13}, {.name = "Internal timeout", .error_code = 0x14}, {.name = "Deferred error from slave not supported at master", .error_code = 0x15}, }; /* Read ERRSELR */ u64 ras_read_errselr(void) { u64 errselr; asm volatile("mrs %0, s3_0_c5_c3_1" : "=r" (errselr)); return errselr; } EXPORT_SYMBOL(ras_read_errselr); /* For read/write of any ERR register, error record X is selected * by writing to ERRSELR_EL1 * ISB is needed in the end to ensure this write completes before another * read/write to this error record comes in. */ void ras_write_errselr(u64 errx) { asm volatile("msr s3_0_c5_c3_1, %0" : : "r" (errx)); isb(); } EXPORT_SYMBOL(ras_write_errselr); /* Read ERRSTATUS */ u64 ras_read_error_status(void) { u64 status; asm volatile("mrs %0, s3_0_c5_c4_2" : "=r" (status)); return status; } EXPORT_SYMBOL(ras_read_error_status); /* Write to ERRSTATUS */ void ras_write_error_status(u64 status) { asm volatile("msr s3_0_c5_c4_2, %0" : : "r" (status)); } EXPORT_SYMBOL(ras_write_error_status); /* Read ERRADDR */ static u64 ras_read_error_address(void) { u64 addr; asm volatile("mrs %0, s3_0_c5_c4_3" : "=r" (addr)); return addr; } /* Write to ERRADDR*/ void ras_write_error_addr(u64 addr) { asm volatile("msr s3_0_c5_c4_3, %0" : : "r" (addr)); } EXPORT_SYMBOL(ras_write_error_addr); /* Read ERRMISC0 */ static u64 ras_read_error_misc0(void) { u64 misc0; asm volatile("mrs %0, s3_0_c5_c5_0" : "=r" (misc0)); return misc0; } /* Write to ERRMISC0*/ void ras_write_error_misc0(u64 misc0) { asm volatile("msr s3_0_c5_c5_0, %0" : : "r" (misc0)); } EXPORT_SYMBOL(ras_write_error_misc0); /* Read ERRMISC1 */ static u64 ras_read_error_misc1(void) { u64 misc1; asm volatile("mrs %0, s3_0_c5_c5_1" : "=r" (misc1)); return misc1; } /* Write to ERRMISC1*/ void ras_write_error_misc1(u64 misc1) { asm volatile("msr s3_0_c5_c5_1, %0" : : "r" (misc1)); } EXPORT_SYMBOL(ras_write_error_misc1); /* Read ERRCTLR_EL1*/ u64 ras_read_error_control(void) { u64 err_ctl; asm volatile("mrs %0, s3_0_c5_c4_1" : "=r" (err_ctl)); return err_ctl; } EXPORT_SYMBOL(ras_read_error_control); /* Write to ERRCTLR_EL1*/ void ras_write_error_control(u64 err_ctl) { asm volatile("msr s3_0_c5_c4_1, %0" : : "r" (err_ctl)); } EXPORT_SYMBOL(ras_write_error_control); /* Read ERRPFGCTL_EL1*/ u64 ras_read_pfg_control(void) { u64 pfg_ctl; asm volatile("mrs %0, s3_0_c15_c1_4" : "=r" (pfg_ctl)); return pfg_ctl; } EXPORT_SYMBOL(ras_read_pfg_control); /* Write to ERRPFGCTL_EL1*/ void ras_write_pfg_control(u64 pfg_ctl) { asm volatile("msr s3_0_c15_c1_4, %0" : : "r" (pfg_ctl)); } EXPORT_SYMBOL(ras_write_pfg_control); /* Read ERRPFGCDN_EL1*/ u64 ras_read_pfg_cdn(void) { u64 pfg_cdn; asm volatile("mrs %0, s3_0_c15_c1_6" : "=r" (pfg_cdn)); return pfg_cdn; } EXPORT_SYMBOL(ras_read_pfg_cdn); /* Write to ERRPFGCDN_EL1*/ void ras_write_pfg_cdn(u64 pfg_cdn) { asm volatile("msr s3_0_c15_c1_6, %0" : : "r" (pfg_cdn)); } EXPORT_SYMBOL(ras_write_pfg_cdn); void unregister_fhi_callback(struct ras_fhi_callback *callback) { unsigned long flags; raw_spin_lock_irqsave(&fhi_lock, flags); list_del(&callback->node); raw_spin_unlock_irqrestore(&fhi_lock, flags); } EXPORT_SYMBOL(unregister_fhi_callback); static unsigned int read_pfr0_ras_version(void) { return PFR0_RAS(read_cpuid(ID_AA64PFR0_EL1)); } static void is_ras_version1(void *ret) { *(int *)(ret) = (read_pfr0_ras_version() == PFR0_RAS_VERSION_1); } /* check if RAS is supported on this CPU * ret = 0 if not supported * = 1 if supported */ int is_ras_cpu(int cpu) { int ret; smp_call_function_single(cpu, is_ras_version1, &ret, 1); return ret; } EXPORT_SYMBOL(is_ras_cpu); int is_this_ras_cpu(void) { return read_pfr0_ras_version() == PFR0_RAS_VERSION_1; } EXPORT_SYMBOL(is_this_ras_cpu); int is_ras_ready(void) { return is_ras_probe_done == 1; } EXPORT_SYMBOL(is_ras_ready); void print_error_record(struct error_record *record, u64 status, int errselr) { struct ras_error *errors; u64 misc0, misc1, addr; u16 ierr, serr; u64 i; int found = 0; u64 err_status = 0; pr_crit("**************************************\n"); pr_crit("RAS Error in %s, ERRSELR_EL1=%d:\n", record->name, errselr); pr_crit("\tStatus = 0x%llx\n", status); /* Find the name of IERR */ ierr = get_error_status_ierr(status); errors = record->errors; if (errors) { for (i = 0; errors[i].name; i++) { if (errors[i].error_code == ierr) { pr_crit("\tIERR = %s: 0x%x\n", errors[i].name, ierr); found = 1; break; } } if (!found) pr_crit("\tUnknown IERR: 0x%x\n", ierr); } else { pr_crit("\tBank does not have any known IERR errors\n"); } /* Find the name of SERR */ found = 0; serr = get_error_status_serr(status); for (i = 0; serr_errors[i].name; i++) { if (serr_errors[i].error_code == serr) { pr_crit("\tSERR = %s: 0x%x\n", serr_errors[i].name, serr); found = 1; break; } } if (!found) pr_crit("\tUnknown SERR: 0x%x\n", serr); if (status & ERRi_STATUS_OF) { pr_crit("\tOverflow (there may be more errors) - Uncorrectable\n"); /* Clear the error by writing 1 to ERRi_STATUS:OF */ err_status |= ERRi_STATUS_OF; } if (status & ERRi_STATUS_UE) { pr_crit("\tUncorrectable (this is fatal)\n"); /* Clear the error by writing 1 to ERR_STATUS:UE * and 11 to ERR_STATUS:UET */ err_status |= ERRi_STATUS_UE; err_status |= ERRi_STATUS_UET; } if (get_error_status_ce(status)) { pr_crit("\tCorrectable Error\n"); /* Clear the error by writing 1 to ERR_STATUS:CE */ err_status |= ERRi_STATUS_CE; } if (status & ERRi_STATUS_MV) { misc0 = ras_read_error_misc0(); misc1 = ras_read_error_misc1(); pr_crit("\tMISC0 = 0x%llx\n", misc0); pr_crit("\tMISC1 = 0x%llx\n", misc1); /* write 1 to clear ERR_STATUS:MV */ err_status |= ERRi_STATUS_MV; } if (status & ERRi_STATUS_AV) { addr = ras_read_error_address(); pr_crit("\tADDR = 0x%llx\n", addr); /* write 1 to clear ERR_STATUS:AV */ err_status |= ERRi_STATUS_AV; } /* Write 1 to clear ERR_STATUS:V */ err_status |= ERRi_STATUS_VALID; /* Write to ERR_STATUS to clear the error */ ras_write_error_status(err_status); pr_crit("**************************************\n"); } EXPORT_SYMBOL(print_error_record); /* Fault Handling Interrupt or FHI * for handling Correctable Errors */ static irqreturn_t ras_fhi_isr(int irq, void *dev_id) { unsigned long flags; struct ras_fhi_callback *callback; /* Iterate through the banks looking for one with an error */ pr_crit("CPU%d: RAS: FHI %d detected\n", smp_processor_id(), irq); raw_spin_lock_irqsave(&fhi_lock, flags); list_for_each_entry(callback, &fhi_callback_list, node) { callback->fn(); } raw_spin_unlock_irqrestore(&fhi_lock, flags); return IRQ_HANDLED; } static int ras_fhi_enable(unsigned int cpu) { if (irq_force_affinity(fhi_irq[cpu], cpumask_of(cpu))) { pr_info("%s: Failed to set IRQ %d affinity to CPU%d\n", __func__, fhi_irq[cpu], cpu); return -EINVAL; } enable_irq(fhi_irq[cpu]); pr_info("%s: FHI %d enabled on CPU%d\n", __func__, fhi_irq[cpu], cpu); return 0; } static int ras_fhi_disable(unsigned int cpu) { int boot_cpu = 0; disable_irq_nosync(fhi_irq[cpu]); if (irq_force_affinity(fhi_irq[cpu], cpumask_of(boot_cpu))) pr_info("%s: Failed to set IRQ %d affinity to boot cpu %d\n", __func__, fhi_irq[cpu], boot_cpu); pr_info("%s: FHI %d disabled\n", __func__, fhi_irq[cpu]); return 0; } /* Register Fault Handling Interrupt or FHI * for Correctable Errors */ static int ras_register_fhi_isr(struct platform_device *pdev) { int err = 0, cpu, irq_count; irq_count = platform_irq_count(pdev); for_each_possible_cpu(cpu) { BUG_ON(cpu >= irq_count); fhi_irq[cpu] = platform_get_irq(pdev, cpu); if (fhi_irq[cpu] <= 0) { dev_err(&pdev->dev, "No IRQ\n"); err = -ENOENT; goto isr_err; } } isr_err: return err; } /* This is an API for CPU specific FHI callbacks * to be registered with fhi_isr handler */ int register_fhi_callback(struct ras_fhi_callback *callback, void *cookie) { unsigned long flags; int err = 0, cpu = 0; raw_spin_lock_irqsave(&fhi_lock, flags); list_add(&callback->node, &fhi_callback_list); raw_spin_unlock_irqrestore(&fhi_lock, flags); for_each_possible_cpu(cpu) { err = request_irq(fhi_irq[cpu], ras_fhi_isr, IRQ_PER_CPU, "ras-fhi", cookie); if (err) { pr_err("%s: request_irq(%d) failed (%d)\n", __func__, fhi_irq[cpu], err); goto isr_err; } disable_irq(fhi_irq[cpu]); } /* Ensure that any CPU brought online sets up FHI */ hp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm64_ras:online", ras_fhi_enable, ras_fhi_disable); if (hp_state < 0) pr_info("%s: unable to register FHI\n", __func__); isr_err: return err; } EXPORT_SYMBOL(register_fhi_callback); static int ras_probe(struct platform_device *pdev) { const struct of_device_id *match; int err = 0, cpu, probe = 0; struct device *dev = &pdev->dev; /* probe only if RAS is supported on either of the online CPUs */ for_each_online_cpu(cpu) { if (is_ras_cpu(cpu)) probe = 1; } if (!probe) { dev_info(dev, "None of the CPUs support RAS"); return -EINVAL; } /* register FHI for Correctable Errors */ match = of_match_device(of_match_ptr(arm64_ras_match), dev); if (!match) { dev_err(dev, "RAS detected, but no device-tree node found," " Cannot register RAS fault handler interrupt (FHI)"); return -ENODEV; } err = ras_register_fhi_isr(pdev); if (err < 0) { dev_info(dev, "Failed to register Fault Handling Interrupts" " ISR"); return err; } /* make sure we have executed everything in the probe * before setting is_ras_probe_done */ smp_mb(); is_ras_probe_done = 1; dev_info(dev, "probed"); return 0; } static int ras_remove(struct platform_device *pdev) { cpuhp_remove_state(hp_state); return 0; } static struct platform_driver ras_driver = { .probe = ras_probe, .remove = ras_remove, .driver = { .owner = THIS_MODULE, .name = "arm64_ras", .of_match_table = of_match_ptr(arm64_ras_match), }, }; static int __init ras_init(void) { return platform_driver_register(&ras_driver); } static void __exit ras_exit(void) { platform_driver_unregister(&ras_driver); } arch_initcall(ras_init); module_exit(ras_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("ARM64 RAS Driver / FHI handler");