sb_edac: Fix support for systems with two home agents per socket
First noticed a problem on a 4 socket machine where EDAC only reported
half the DIMMS. Tracked this down to the code that assumes that systems
with two home agents only have two memory channels on each agent. This
is true on 2 sockect ("-EP") machines. But four socket ("-EX") machines
have four memory channels on each home agent.
The old code would have had problems on two socket systems as it did
a shuffling trick to make the internals of the code think that the
channels from the first agent were '0' and '1', with the second agent
providing '2' and '3'. But the code didn't uniformly convert from
{ha,channel} tuples to this internal representation.
New code always considers up to eight channels.
On a machine with a single home agent these map easily to edac channels
0, 1, 2, 3. On machines with two home agents we map using:
edac_channel = 4*ha# + channel
So on a -EP machine where each home agent supports only two channels
we'll fill in channels 0, 1, 4, 5, and on a -EX machine we use all of 0,
1, 2, 3, 4, 5, 6, 7.
[mchehab@osg.samsung.com: fold a fixup patch as per Tony's request and fixed
a few CodingStyle issues]
Signed-off-by: Tony Luck <tony.luck@intel.com>
Acked-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index b964832..be961d4 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -34,7 +34,7 @@
/*
* Alter this version for the module when modifications are made
*/
-#define SBRIDGE_REVISION " Ver: 1.1.0 "
+#define SBRIDGE_REVISION " Ver: 1.1.1 "
#define EDAC_MOD_STR "sbridge_edac"
/*
@@ -254,7 +254,7 @@
* sbridge structs
*/
-#define NUM_CHANNELS 4
+#define NUM_CHANNELS 8 /* 2MC per socket, four chan per MC */
#define MAX_DIMMS 3 /* Max DIMMS per channel */
#define CHANNEL_UNSPECIFIED 0xf /* Intel IA32 SDM 15-14 */
@@ -393,6 +393,8 @@
#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS 0x0e79
#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0 0x0e6a
#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1 0x0e6b
+#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2 0x0e6c
+#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3 0x0e6d
static const struct pci_id_descr pci_dev_descr_ibridge[] = {
/* Processor Home Agent */
@@ -421,6 +423,8 @@
#endif
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0, 1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1, 1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2, 1) },
+ { PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3, 1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_1HA_DDRIO0, 1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_2HA_DDRIO0, 1) },
@@ -909,6 +913,8 @@
for (i = 0; i < NUM_CHANNELS; i++) {
u32 mtr;
+ if (!pvt->pci_tad[i])
+ continue;
for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) {
dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
i, j, 0);
@@ -925,8 +931,8 @@
size = ((u64)rows * cols * banks * ranks) >> (20 - 3);
npages = MiB_TO_PAGES(size);
- edac_dbg(0, "mc#%d: channel %d, dimm %d, %Ld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n",
- pvt->sbridge_dev->mc, i, j,
+ edac_dbg(0, "mc#%d: ha %d channel %d, dimm %d, %lld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n",
+ pvt->sbridge_dev->mc, i/4, i%4, j,
size, npages,
banks, ranks, rows, cols);
@@ -946,8 +952,8 @@
dimm->mtype = mtype;
dimm->edac_mode = mode;
snprintf(dimm->label, sizeof(dimm->label),
- "CPU_SrcID#%u_Channel#%u_DIMM#%u",
- pvt->sbridge_dev->source_id, i, j);
+ "CPU_SrcID#%u_Ha#%u_Chan#%u_DIMM#%u",
+ pvt->sbridge_dev->source_id, i/4, i%4, j);
}
}
}
@@ -1128,7 +1134,7 @@
static int get_memory_error_data(struct mem_ctl_info *mci,
u64 addr,
- u8 *socket,
+ u8 *socket, u8 *ha,
long *channel_mask,
u8 *rank,
char **area_type, char *msg)
@@ -1141,7 +1147,7 @@
int interleave_mode, shiftup = 0;
unsigned sad_interleave[pvt->info.max_interleave];
u32 reg, dram_rule;
- u8 ch_way, sck_way, pkg, sad_ha = 0;
+ u8 ch_way, sck_way, pkg, sad_ha = 0, ch_add = 0;
u32 tad_offset;
u32 rir_way;
u32 mb, gb;
@@ -1254,6 +1260,8 @@
pkg = sad_pkg(pvt->info.interleave_pkg, reg, idx);
*socket = sad_pkg_socket(pkg);
sad_ha = sad_pkg_ha(pkg);
+ if (sad_ha)
+ ch_add = 4;
if (a7mode) {
/* MCChanShiftUpEnable */
@@ -1270,10 +1278,14 @@
pkg = sad_pkg(pvt->info.interleave_pkg, reg, idx);
*socket = sad_pkg_socket(pkg);
sad_ha = sad_pkg_ha(pkg);
+ if (sad_ha)
+ ch_add = 4;
edac_dbg(0, "SAD interleave package: %d = CPU socket %d, HA %d\n",
idx, *socket, sad_ha);
}
+ *ha = sad_ha;
+
/*
* Move to the proper node structure, in order to access the
* right PCI registers
@@ -1346,7 +1358,7 @@
}
*channel_mask = 1 << base_ch;
- pci_read_config_dword(pvt->pci_tad[base_ch],
+ pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
tad_ch_nilv_offset[n_tads],
&tad_offset);
@@ -1405,7 +1417,7 @@
* Step 3) Decode rank
*/
for (n_rir = 0; n_rir < MAX_RIR_RANGES; n_rir++) {
- pci_read_config_dword(pvt->pci_tad[base_ch],
+ pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
rir_way_limit[n_rir],
®);
@@ -1435,7 +1447,7 @@
idx = (ch_addr >> 13); /* FIXME: Datasheet says to shift by 15 */
idx %= 1 << rir_way;
- pci_read_config_dword(pvt->pci_tad[base_ch],
+ pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
rir_offset[n_rir][idx],
®);
*rank = RIR_RNK_TGT(reg);
@@ -1681,16 +1693,9 @@
struct sbridge_dev *sbridge_dev)
{
struct sbridge_pvt *pvt = mci->pvt_info;
- struct pci_dev *pdev, *tmp;
+ struct pci_dev *pdev;
+ u8 saw_chan_mask = 0;
int i;
- bool mode_2ha = false;
-
- tmp = pci_get_device(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, NULL);
- if (tmp) {
- mode_2ha = true;
- pci_dev_put(tmp);
- }
for (i = 0; i < sbridge_dev->n_devs; i++) {
pdev = sbridge_dev->pdev[i];
@@ -1706,26 +1711,21 @@
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS:
pvt->pci_ras = pdev;
break;
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD2:
- case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3:
- /* if we have 2 HAs active, channels 2 and 3
- * are in other device */
- if (mode_2ha)
- break;
- /* fall through */
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD0:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD1:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD2:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3:
{
int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD0;
pvt->pci_tad[id] = pdev;
+ saw_chan_mask |= 1 << id;
}
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_2HA_DDRIO0:
pvt->pci_ddrio = pdev;
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_1HA_DDRIO0:
- if (!mode_2ha)
- pvt->pci_ddrio = pdev;
+ pvt->pci_ddrio = pdev;
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_SAD:
pvt->pci_sad0 = pdev;
@@ -1741,13 +1741,12 @@
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2:
+ case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3:
{
- int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0 + 2;
-
- /* we shouldn't have this device if we have just one
- * HA present */
- WARN_ON(!mode_2ha);
+ int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0 + 4;
pvt->pci_tad[id] = pdev;
+ saw_chan_mask |= 1 << id;
}
break;
default:
@@ -1766,10 +1765,10 @@
!pvt->pci_ta)
goto enodev;
- for (i = 0; i < NUM_CHANNELS; i++) {
- if (!pvt->pci_tad[i])
- goto enodev;
- }
+ if (saw_chan_mask != 0x0f && /* -EN */
+ saw_chan_mask != 0x33 && /* -EP */
+ saw_chan_mask != 0xff) /* -EX */
+ goto enodev;
return 0;
enodev:
@@ -1787,16 +1786,9 @@
struct sbridge_dev *sbridge_dev)
{
struct sbridge_pvt *pvt = mci->pvt_info;
- struct pci_dev *pdev, *tmp;
+ struct pci_dev *pdev;
+ u8 saw_chan_mask = 0;
int i;
- bool mode_2ha = false;
-
- tmp = pci_get_device(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1, NULL);
- if (tmp) {
- mode_2ha = true;
- pci_dev_put(tmp);
- }
/* there's only one device per system; not tied to any bus */
if (pvt->info.pci_vtd == NULL)
@@ -1827,18 +1819,26 @@
pvt->pci_ras = pdev;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0:
- pvt->pci_tad[0] = pdev;
- break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD1:
- pvt->pci_tad[1] = pdev;
- break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD2:
- if (!mode_2ha)
- pvt->pci_tad[2] = pdev;
- break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD3:
- if (!mode_2ha)
- pvt->pci_tad[3] = pdev;
+ {
+ int id = pdev->device - PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0;
+
+ pvt->pci_tad[id] = pdev;
+ saw_chan_mask |= 1 << id;
+ }
+ break;
+ case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0:
+ case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD1:
+ case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD2:
+ case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD3:
+ {
+ int id = pdev->device - PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0 + 4;
+
+ pvt->pci_tad[id] = pdev;
+ saw_chan_mask |= 1 << id;
+ }
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0:
pvt->pci_ddrio = pdev;
@@ -1849,14 +1849,6 @@
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TA:
pvt->pci_ha1_ta = pdev;
break;
- case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0:
- if (mode_2ha)
- pvt->pci_tad[2] = pdev;
- break;
- case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD1:
- if (mode_2ha)
- pvt->pci_tad[3] = pdev;
- break;
default:
break;
}
@@ -1872,10 +1864,10 @@
!pvt->pci_ras || !pvt->pci_ta || !pvt->info.pci_vtd)
goto enodev;
- for (i = 0; i < NUM_CHANNELS; i++) {
- if (!pvt->pci_tad[i])
- goto enodev;
- }
+ if (saw_chan_mask != 0x0f && /* -EN */
+ saw_chan_mask != 0x33 && /* -EP */
+ saw_chan_mask != 0xff) /* -EX */
+ goto enodev;
return 0;
enodev:
@@ -1986,7 +1978,7 @@
u32 channel = GET_BITFIELD(m->status, 0, 3);
u32 optypenum = GET_BITFIELD(m->status, 4, 6);
long channel_mask, first_channel;
- u8 rank, socket;
+ u8 rank, socket, ha;
int rc, dimm;
char *area_type = NULL;
@@ -2048,7 +2040,7 @@
if (!GET_BITFIELD(m->status, 58, 58))
return;
- rc = get_memory_error_data(mci, m->addr, &socket,
+ rc = get_memory_error_data(mci, m->addr, &socket, &ha,
&channel_mask, &rank, &area_type, msg);
if (rc < 0)
goto err_parsing;
@@ -2080,12 +2072,12 @@
channel = first_channel;
snprintf(msg, sizeof(msg),
- "%s%s area:%s err_code:%04x:%04x socket:%d channel_mask:%ld rank:%d",
+ "%s%s area:%s err_code:%04x:%04x socket:%d ha:%d channel_mask:%ld rank:%d",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
area_type,
mscod, errcode,
- socket,
+ socket, ha,
channel_mask,
rank);
@@ -2099,7 +2091,7 @@
/* Call the helper to output message */
edac_mc_handle_error(tp_event, mci, core_err_cnt,
m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
- channel, dimm, -1,
+ 4*ha+channel, dimm, -1,
optype, msg);
return;
err_parsing: