ACCRE R9 Cluster Quick and Dirty Status

Report generated at Thu Apr 3 03:23:01 AM CDT 2025

Problem Nodes

HOSTNAMES      STATE      AVAIL_FEATURES                 TIMESTAMP            USER       REASON                                              
cn457          down*      sandybridge                    2025-03-06T14:07:28  slurm      Not responding                                      
cn1124         drained    sandybridge                    2025-03-28T16:58:34  slurm      batch job complete failure                          
cn1131         drained*   sandybridge                    2025-02-06T16:56:30  slurm      Sai - RT 90326 - Requires troubleshooting configurat
cn1275         draining   haswell                        2025-04-02T03:56:58  root       Kill task failed                                    
cn1298         down*      haswell                        2025-03-28T07:10:00  slurm      Not responding                                      
cn1377         drained    haswell                        2025-02-24T12:51:55  appelte1   Nobody - RT90691 - check memory instability         
cn1399         drained*   haswell                        2025-02-27T09:52:56  root       Scott Took - RT 90327 - Not reachable               
cn1413         drained*   haswell,sbcstmp                2025-03-25T16:59:30  ninchrsc   Samuel - Bad - Decom for part                       
cn1424         drained*   haswell                        2025-03-17T14:09:17  root       Provisioning - Scott - resume when R9 and GREEN     
cn1427         drained*   haswell                        2025-03-13T07:24:41  appelte1   Provisioning - Nobody - resume when R9 and GREEN    
cn1477         draining   skylake                        2025-04-02T05:03:22  root       Kill task failed                                    
cn1484         down       skylake                        2025-04-03T00:36:37  slurm      Node unexpectedly rebooted                          
cn1486         down       skylake                        2025-04-03T02:39:41  slurm      Node unexpectedly rebooted                          
cn1488         down       skylake                        2025-04-03T00:37:20  slurm      Node unexpectedly rebooted                          
cn1492         down       skylake                        2025-04-03T00:36:48  slurm      Node unexpectedly rebooted                          
cn1504         drained    skylake                        2025-02-16T13:51:45  slurm      Nobody - 90507 - NIC flapping : Not responding      
cn1508         down       skylake                        2025-04-03T01:59:17  slurm      Node unexpectedly rebooted                          
cn1509         down       skylake                        2025-04-03T00:36:20  slurm      Node unexpectedly rebooted                          
cn1514         drained    skylake                        2025-03-17T09:07:25  slurm      Nobody - RT91150 - Network link flapping : Not respo
cn1515         down       skylake                        2025-04-03T02:07:32  slurm      Node unexpectedly rebooted                          
cn1517         down       skylake                        2025-04-03T00:36:28  slurm      Node unexpectedly rebooted                          
cn1519         down       skylake                        2025-04-03T00:36:11  slurm      Node unexpectedly rebooted                          
cn1527         down       skylake                        2025-04-03T00:37:20  slurm      Node unexpectedly rebooted                          
cn1528         down       skylake                        2025-04-03T02:21:49  slurm      Node unexpectedly rebooted                          
cn1539         drained    cascadelake                    2025-02-05T16:04:39  root       Alan - RT N/A - NO TOUCHY?                          
cn1541         drained*   cascadelake                    2025-02-28T16:01:10  root       Samuel - Bad - Decom for parts                      
cn1543         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1546         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1547         drained    cascadelake                    2025-03-31T14:04:15  slurm      Prolog error                                        
cn1552         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1553         drained    cascadelake                    2025-03-21T08:57:37  appelte1   Nobody - RT91248 - undiagnosed system instability   
cn1556         drained    cascadelake                    2025-03-21T08:57:37  appelte1   Nobody - RT91248 - undiagnosed system instability   
cn1557         drained    cascadelake                    2025-03-21T08:57:37  appelte1   Nobody - RT91248 - undiagnosed system instability   
cn1561         draining   cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1564         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1572         draining   cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1575         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1576         down       cascadelake                    2025-04-03T03:14:33  slurm      Node unexpectedly rebooted                          
cn1577         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1580         drained*   cascadelake                    2025-03-13T07:24:41  appelte1   Provisioning - Nobody - resume when R9 and GREEN    
cn1582         down*      cascadelake                    2025-03-31T16:30:48  slurm      Not responding                                      
cn1585         drained    cascadelake                    2025-03-31T14:04:15  slurm      Prolog error                                        
cn1588         drained    cascadelake                    2025-04-01T09:42:57  root       Samuel - RT91355 - Bad Batteries                    
cn1590         down*      cascadelake                    2025-03-20T02:06:11  slurm      Not responding                                      
cn1596         down*      zen                            2025-03-20T02:12:51  slurm      Not responding                                      
cn1605         down       zen                            2025-04-03T03:17:54  slurm      Node unexpectedly rebooted                          
cn1611         drained    zen                            2025-02-28T10:53:50  root       Nobody - RT90770 - system instability causing reboot
cn1613         draining   zen                            2025-04-02T21:53:01  slurm      Prolog error                                        
cn1617         drained    zen                            2025-04-02T09:50:58  root       Kill task failed                                    
cn1624         draining   zen                            2025-04-02T08:18:55  slurm      Prolog error                                        
gpu0016        drained*   broadwell,pascal,p3584         2025-03-25T08:20:38  root       Nobody - RT 90330 - Migrate to R9                   
gpu0024        drained*   broadwell,pascal,p3840         2025-03-25T08:20:38  root       Nobody - RT 90330 - Migrate to R9                   
gpu0036        drained*   skylake,turing,csbtmp          2025-03-25T08:20:38  root       Nobody - RT 90330 - Migrate to R9                   
gpu0040        inval      skylake,turing,csbtmp          2025-03-06T18:52:31  slurm      Low RealMemory (reported:353824 < 100.00% of configu
gpu0044        inval      skylake,turing,csbtmp          2025-04-01T11:54:45  slurm      gres/gpu count reported lower than configured (2 < 4
gpu0053        drained*   skylake,turing                 2025-03-13T07:23:41  appelte1   Provisioning - Nobody - resume when R9 and GREEN    
gpu0058        drained*   skylake,a4000x4                2025-03-11T07:43:50  appelte1   Provisioning - Nobody - resume when R9 and GREEN    
gpu0060        down*      zen3,a4000x8                   2025-04-01T09:04:33  slurm      Not responding                                      
gpu0071        drained    icelake,a6000x4,csbtmp         2025-02-24T10:17:28  appelte1   Brandon - RT90079 - testing nvlink daemon           
gpu0083        drained*   zen,a100                       2025-03-17T13:19:21  root       On Rocky - No network                               
gracehopper01  down*      aarch,hopper                   2025-04-01T09:51:28  slurm      Not responding                                      
gracehopper02  down*      aarch,hopper                   2025-04-01T09:51:28  slurm      Not responding                                      
hgx01          unknown*   zen,h100                       Unknown              root       none                                                

Queue Summary (Batch)

GROUP        USER                  ACTIVE_JOBS  ACTIVE_CORES  PENDING_JOBS  PENDING_CORES
-----------------------------------------------------------------------------------------
accre                                  1            4             0             0
            appelte1                   1            4             0             0
-----------------------------------------------------------------------------------------
aldrich_lab                            1            1             0             0
            amannn1                    1            1             0             0
-----------------------------------------------------------------------------------------
beam_lab                             240          300           123          1476
            khancm                   236          236             0             0
            weeksae                    0            0           123          1476
            zhuj29                     4           64             0             0
-----------------------------------------------------------------------------------------
booth_lab                             16           16            38            38
            chenh55                   12           12            38            38
            comptoab                   1            1             0             0
            mathura                    1            1             0             0
            muesm                      1            1             0             0
            wanj129                    1            1             0             0
-----------------------------------------------------------------------------------------
brg_cores                             15           97             0             0
            desilvt                   13           65             0             0
            kandelr                    1           16             0             0
            xuy33                      1           16             0             0
-----------------------------------------------------------------------------------------
cds_group                              1           13             0             0
            shress6                    1           13             0             0
-----------------------------------------------------------------------------------------
cms                                 1807         4597           158           392
            cmslocal                  95          299             0             0
            cmspilot                1712         4298           158           392
-----------------------------------------------------------------------------------------
csb_sanders                            1           25             0             0
            lig7                       1           25             0             0
-----------------------------------------------------------------------------------------
hadjim_lab                             3           13             0             0
            comers                     2            9             0             0
            reasosa2                   1            4             0             0
-----------------------------------------------------------------------------------------
h_cqs                                  6           72             0             0
            shengq2                    6           72             0             0
-----------------------------------------------------------------------------------------
h_darby_lab                            2            4             0             0
            phant2                     2            4             0             0
-----------------------------------------------------------------------------------------
h_vmac                                11          176           650          3900
            regelsan                  11          176           650          3900
-----------------------------------------------------------------------------------------
h_vuiis                               10           14             0             0
            vuiis_daily_s             10           14             0             0
-----------------------------------------------------------------------------------------
jswhep                                 1            1             0             0
            maegliam                   1            1             0             0
-----------------------------------------------------------------------------------------
l3_bick_lab                            2           40             0             0
            qianh4                     2           40             0             0
-----------------------------------------------------------------------------------------
l3_precision_nutriti                     1           40             0             0
            baghem1                    1           40             0             0
-----------------------------------------------------------------------------------------
mchs_compbio                           4            5             0             0
            guevaa1                    2            3             0             0
            guevam5                    2            2             0             0
-----------------------------------------------------------------------------------------
mcml                                   1           16             0             0
            subravvr                   1           16             0             0
-----------------------------------------------------------------------------------------
nbody                                  1            4             0             0
            ligo                       1            4             0             0
-----------------------------------------------------------------------------------------
neurogroup                             1            1             0             0
            doumaa                     1            1             0             0
-----------------------------------------------------------------------------------------
palmeri_lab                            1            1             0             0
            jeongj6                    1            1             0             0
-----------------------------------------------------------------------------------------
p_csb_meiler                          59          313             0             0
            cheonglb                   1            8             0             0
            tydingcw                  19          266             0             0
            vogtsh                    39           39             0             0
-----------------------------------------------------------------------------------------
p_masi                                17           30             0             0
            amandm1                   10           19             0             0
            kimm58                     5            7             0             0
            mcmastem                   2            4             0             0
-----------------------------------------------------------------------------------------
p_masi_vuiis                           6           12             0             0
            vuiis_archive              6           12             0             0
-----------------------------------------------------------------------------------------
rokaslab                              23          119             2             2
            davidkt                   17           17             0             0
            riedlio                    2           80             0             0
            sautet1                    4           22             2             2
-----------------------------------------------------------------------------------------
sbcs                                  28           56             1             1
            lyul1                     28           56             1             1
-----------------------------------------------------------------------------------------
taylor_group                           3            6             0             0
            petrop3                    2            4             0             0
            schultls                   1            2             0             0
-----------------------------------------------------------------------------------------
walker_lab                             7           76             0             0
            fieldhm                    1            1             0             0
            hec7                       3           48             0             0
            ravacm                     1            1             0             0
            walkeas2                   2           26             0             0
-----------------------------------------------------------------------------------------
wankowicz_lab                        553          555          1279          1279
            seol                       1            3             0             0
            wankows                  552          552          1279          1279
-----------------------------------------------------------------------------------------
williams_roberson_la                     1            1             0             0
            vundav2                    1            1             0             0
-----------------------------------------------------------------------------------------
womelsdorf_lab                        31          775            29           725
            azezewka                  31          775            29           725
-----------------------------------------------------------------------------------------
yang_lab_csb                           2            2             0             0
            jurichc                    2            2             0             0
-----------------------------------------------------------------------------------------
Totals:                             2856         7385          2280          7813

Queue Summary (Batch GPU)

GROUP        USER                  ACTIVE_JOBS  ACTIVE_GPUS   PENDING_JOBS   PENDING_GPUS
-----------------------------------------------------------------------------------------
csb_gpu_acc                           64           64           160           162
            browkl12                   0            0             2             2
            changga                   57           57           149           149
            cryosparcuser              1            1             1             1
            karadim                    0            0             5             5
            melarafj                   5            5             0             0
            tranmh                     1            1             3             5
-----------------------------------------------------------------------------------------
vuiis_masi_gpu_acc                     0            0             2             3
            kimm58                     0            0             1             1
            liuy140                    0            0             1             2
-----------------------------------------------------------------------------------------
yang_lab_csb                           0            0             1             1
            ranx                       0            0             1             1
-----------------------------------------------------------------------------------------
Totals:                               64           64           163           166

Queue Summary (interactive)

GROUP        USER                  ACTIVE_JOBS  ACTIVE_CORES  PENDING_JOBS  PENDING_CORES
-----------------------------------------------------------------------------------------
p_dsi_dgx                              2           96             0             0
            huy28                      2           96             0             0
-----------------------------------------------------------------------------------------
Totals:                                2           96             0             0

Partition Summary

PARTITION   AVAIL  TIMELIMIT  NODES  STATE NODELIST
interactive    up 14-00:00:0      1    mix dgx01
interactive    up 14-00:00:0      5   idle cn[0001,1287],dgx[02-04]
batch*         up 14-00:00:0      6 drain* cn[1399,1413,1424,1427,1541,1580]
batch*         up 14-00:00:0      4  down* cn[1298,1582,1590,1596]
batch*         up 14-00:00:0      1   comp cn1555
batch*         up 14-00:00:0      6   drng cn[1275,1477,1561,1572,1613,1624]
batch*         up 14-00:00:0     18  drain cn[1377,1504,1514,1539,1543,1546-1547,1552-1553,1556-1557,1564,1575,1577,1585,1588,1611,1617]
batch*         up 14-00:00:0    142    mix cn[1230,1258,1267-1268,1273,1278,1280,1282,1284,1309-1310,1313-1314,1316,1409,1412,1417,1420,1425,1430-1432,1434-1443,1445-1458,1460-1464,1466-1476,1478-1481,1493,1500,1526,1530-1538,1540,1544-1545,1548-1551,1554,1558-1559,1562-1563,1565-1571,1573-1574,1578-1579,1581,1583-1584,1586-1587,1589,1591-1595,1597,1601-1604,1606-1610,1612,1614,1618-1623,1625-1633,1700,1702-1703]
batch*         up 14-00:00:0    132  alloc cn[1201-1229,1231-1242,1257,1259-1262,1264-1266,1269-1272,1274,1276-1277,1279,1281,1283,1285-1286,1288-1297,1299,1303-1305,1308,1312,1327,1364,1400-1408,1410-1411,1414-1416,1418-1419,1423,1426,1482-1483,1485,1487,1489-1491,1494-1499,1501-1503,1505-1507,1510-1513,1516,1518,1520-1525,1529,1615-1616,1705]
batch*         up 14-00:00:0     13   down cn[1484,1486,1488,1492,1508-1509,1515,1517,1519,1527-1528,1576,1605]
legacy_hw      up 14-00:00:0      1 drain* cn1131
legacy_hw      up 14-00:00:0      1  down* cn457
legacy_hw      up 14-00:00:0      1  drain cn1124
batch_gpu      up 14-00:00:0      2  inval gpu[0040,0044]
batch_gpu      up 14-00:00:0      6 drain* gpu[0016,0024,0036,0053,0058,0083]
batch_gpu      up 14-00:00:0      3  down* gpu0060,gracehopper[01-02]
batch_gpu      up 14-00:00:0      1   unk* hgx01
batch_gpu      up 14-00:00:0      1  drain gpu0071
batch_gpu      up 14-00:00:0     20    mix gpu[0037,0039,0041-0042,0062-0070,0072-0076,0300-0301]
batch_gpu      up 14-00:00:0     16   idle gpu[0050,0056-0057,0061,0084-0085,0302-0310],hgx02
sam            up 2-02:00:00      2   idle cms-sam-[01-02]