@@ -44,7 +44,7 @@ def __init__(self):
44
44
self .gpuMemoryTotal = 0
45
45
self .gpuName = ''
46
46
self .curves = defaultdict (list )
47
-
47
+ self . nvidia_smi = None
48
48
self ._isInit = False
49
49
50
50
def initOnFirstTime (self ):
@@ -53,40 +53,21 @@ def initOnFirstTime(self):
53
53
self ._isInit = True
54
54
55
55
self .cpuFreq = psutil .cpu_freq ().max
56
- self .ramTotal = psutil .virtual_memory ().total / 1024 / 1024 / 1024
56
+ self .ramTotal = psutil .virtual_memory ().total / ( 1024 * 1024 * 1024 )
57
57
58
58
if platform .system () == "Windows" :
59
59
from distutils import spawn
60
60
# If the platform is Windows and nvidia-smi
61
- # could not be found from the environment path,
62
- # try to find it from system drive with default installation path
63
61
self .nvidia_smi = spawn .find_executable ('nvidia-smi' )
64
62
if self .nvidia_smi is None :
65
- self .nvidia_smi = "%s\\ Program Files\\ NVIDIA Corporation\\ NVSMI\\ nvidia-smi.exe" % os .environ ['systemdrive' ]
63
+ # could not be found from the environment path,
64
+ # try to find it from system drive with default installation path
65
+ default_nvidia_smi = "%s\\ Program Files\\ NVIDIA Corporation\\ NVSMI\\ nvidia-smi.exe" % os .environ ['systemdrive' ]
66
+ if os .path .isfile (default_nvidia_smi ):
67
+ self .nvidia_smi = default_nvidia_smi
66
68
else :
67
69
self .nvidia_smi = "nvidia-smi"
68
70
69
- try :
70
- p = subprocess .Popen ([self .nvidia_smi , "-q" , "-x" ], stdout = subprocess .PIPE )
71
- xmlGpu , stdError = p .communicate ()
72
-
73
- smiTree = ET .fromstring (xmlGpu )
74
- gpuTree = smiTree .find ('gpu' )
75
-
76
- try :
77
- self .gpuMemoryTotal = gpuTree .find ('fb_memory_usage' ).find ('total' ).text .split (" " )[0 ]
78
- except Exception as e :
79
- logging .debug ('Failed to get gpuMemoryTotal: "{}".' .format (str (e )))
80
- pass
81
- try :
82
- self .gpuName = gpuTree .find ('product_name' ).text
83
- except Exception as e :
84
- logging .debug ('Failed to get gpuName: "{}".' .format (str (e )))
85
- pass
86
-
87
- except Exception as e :
88
- logging .debug ('Failed to get information from nvidia_smi at init: "{}".' .format (str (e )))
89
-
90
71
def _addKV (self , k , v ):
91
72
if isinstance (v , tuple ):
92
73
for ki , vi in v ._asdict ().items ():
@@ -98,18 +79,23 @@ def _addKV(self, k, v):
98
79
self .curves [k ].append (v )
99
80
100
81
def update(self):
    """Take one sample of the machine-wide statistics.

    Runs the lazy one-time initialisation, then records CPU usage (one
    value per core), RAM and swap usage percentages, and the disk I/O
    counters through ``_addKV``, and finally calls ``updateGpu``.

    Sampling is best effort: any exception raised while collecting is
    logged at debug level and swallowed, so a failing probe does not
    propagate to the caller. Values recorded before the failure are
    kept, which means a failed sample may be only partially recorded.
    """
    try:
        self.initOnFirstTime()
        # interval=None => non-blocking (percentage since last call)
        self._addKV('cpuUsage', psutil.cpu_percent(percpu=True))
        self._addKV('ramUsage', psutil.virtual_memory().percent)
        self._addKV('swapUsage', psutil.swap_memory().percent)
        # A zero 'vramUsage' entry is always recorded before the GPU query.
        # NOTE(review): presumably a placeholder for machines where the GPU
        # query yields nothing -- confirm against updateGpu.
        self._addKV('vramUsage', 0)
        self._addKV('ioCounters', psutil.disk_io_counters())
        self.updateGpu()
    except Exception as e:
        # Deliberately broad: statistics are auxiliary, so a failure here
        # is reported but never raised.
        logging.debug('Failed to get statistics: "%s".', e)
108
92
109
93
def updateGpu (self ):
94
+ if not self .nvidia_smi :
95
+ return
110
96
try :
111
- p = subprocess .Popen ([self .nvidia_smi , "-q" , "-x" ], stdout = subprocess .PIPE )
112
- xmlGpu , stdError = p .communicate ()
97
+ p = subprocess .Popen ([self .nvidia_smi , "-q" , "-x" ], stdout = subprocess .PIPE , stderr = subprocess . PIPE )
98
+ xmlGpu , stdError = p .communicate (timeout = 10 ) # 10 seconds
113
99
114
100
smiTree = ET .fromstring (xmlGpu )
115
101
gpuTree = smiTree .find ('gpu' )
@@ -129,7 +115,11 @@ def updateGpu(self):
129
115
except Exception as e :
130
116
logging .debug ('Failed to get gpuTemperature: "{}".' .format (str (e )))
131
117
pass
132
-
118
+ except subprocess .TimeoutExpired as e :
119
+ logging .debug ('Timeout when retrieving information from nvidia_smi: "{}".' .format (str (e )))
120
+ p .kill ()
121
+ outs , errs = p .communicate ()
122
+ return
133
123
except Exception as e :
134
124
logging .debug ('Failed to get information from nvidia_smi: "{}".' .format (str (e )))
135
125
return
@@ -201,15 +191,19 @@ def update(self, proc):
201
191
data = proc .as_dict (self .dynamicKeys )
202
192
for k , v in data .items ():
203
193
self ._addKV (k , v )
204
-
205
- files = [f .path for f in proc .open_files ()]
206
- if self .lastIterIndexWithFiles != - 1 :
207
- if set (files ) != set (self .openFiles [self .lastIterIndexWithFiles ]):
208
- self .openFiles [self .iterIndex ] = files
209
- self .lastIterIndexWithFiles = self .iterIndex
210
- elif files :
211
- self .openFiles [self .iterIndex ] = files
212
- self .lastIterIndexWithFiles = self .iterIndex
194
+
195
+ ## Note: Do not collect stats about open files for now,
196
# as there is a bug in psutil-5.7.2 on Windows which crashes the application.
197
+ # https://github.com/giampaolo/psutil/issues/1763
198
+ #
199
+ # files = [f.path for f in proc.open_files()]
200
+ # if self.lastIterIndexWithFiles != -1:
201
+ # if set(files) != set(self.openFiles[self.lastIterIndexWithFiles]):
202
+ # self.openFiles[self.iterIndex] = files
203
+ # self.lastIterIndexWithFiles = self.iterIndex
204
+ # elif files:
205
+ # self.openFiles[self.iterIndex] = files
206
+ # self.lastIterIndexWithFiles = self.iterIndex
213
207
self .iterIndex += 1
214
208
215
209
def toDict (self ):
@@ -234,7 +228,7 @@ def __init__(self):
234
228
self .computer = ComputerStatistics ()
235
229
self .process = ProcStatistics ()
236
230
self .times = []
237
- self .interval = 5
231
+ self .interval = 10 # refresh interval in seconds
238
232
239
233
def update (self , proc ):
240
234
'''
0 commit comments