Skip to content

Commit c39367f

Browse files
authored
Merge pull request #256 from mihaimorariu/devel/qm9_example
Scale output labels in the QM9 example and refactor code.
2 parents dc99ce7 + 3787c0c commit c39367f

File tree

4 files changed

+325
-212
lines changed

4 files changed

+325
-212
lines changed

examples/qm9/evaluate_models_qm9.sh

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,17 @@ gpu=${2:--1}
88

99
for method in ${methods[@]}
1010
do
11-
result_dir=${prefix}${method}
12-
python train_qm9.py --method ${method} --gpu ${gpu} --out ${result_dir} --epoch ${epoch}
13-
python predict_qm9.py --in-dir ${result_dir} --method ${method}
11+
result_dir=${prefix}${method}
12+
13+
python train_qm9.py \
14+
--method ${method} \
15+
--gpu ${gpu} \
16+
--out ${result_dir} \
17+
--epoch ${epoch}
18+
19+
python predict_qm9.py \
20+
--in-dir ${result_dir} \
21+
--method ${method}
1422
done
1523

1624
python plot.py --prefix ${prefix} --methods ${methods[@]}

examples/qm9/predict_qm9.py

Lines changed: 97 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
#!/usr/bin/env python
2-
32
from __future__ import print_function
3+
44
import argparse
55
import json
6+
import numpy
67
import os
8+
import pandas
79
import pickle
810

911
from chainer.iterators import SerialIterator
1012
from chainer.training.extensions import Evaluator
11-
import pandas
1213

1314
try:
1415
import matplotlib
@@ -19,7 +20,6 @@
1920
from chainer import cuda
2021
from chainer.datasets import split_dataset_random
2122
from chainer import Variable
22-
import numpy # NOQA
2323

2424
from chainer_chemistry.dataset.converters import concat_mols
2525
from chainer_chemistry.dataset.preprocessors import preprocess_method_dict
@@ -33,133 +33,162 @@
3333
from train_qm9 import MeanAbsError, RootMeanSqrError # NOQA
3434

3535

36-
def main():
37-
# Supported preprocessing/network list
36+
class ScaledGraphConvPredictor(GraphConvPredictor):
    """Graph convolution predictor that rescales its predicted labels.

    A scaler may be attached after construction through the ``scaler``
    attribute. It must expose ``inverse_transform``. When the attribute is
    missing or set to ``None`` the predictions of the parent class are
    returned unchanged.
    """

    def __init__(self, *args, **kwargs):
        """Initializes the (scaled) graph convolution predictor.

        All positional and keyword arguments are forwarded unchanged to
        ``GraphConvPredictor.__init__``.
        """
        super(ScaledGraphConvPredictor, self).__init__(*args, **kwargs)

    def __call__(self, atoms, adjs):
        """Predicts the labels and maps them back through the scaler.

        Args:
            atoms: atom input, forwarded to the parent predictor.
            adjs: adjacency input, forwarded to the parent predictor.

        Returns:
            The parent's output as is when no scaler is set; otherwise a new
            ``Variable`` holding the inverse-transformed predictions, placed
            on the GPU when the parent's output data was not a NumPy array.
        """
        h = super(ScaledGraphConvPredictor, self).__call__(atoms, adjs)

        # The caller may assign ``scaler = None`` to disable scaling, so a
        # plain ``hasattr`` check is not enough to decide whether to rescale.
        scaler = getattr(self, 'scaler', None)
        if scaler is None:
            # ``h`` is already the parent's output; do not wrap it again.
            return h

        # Remember where the data lived before moving it to the CPU, since
        # the scaler is applied to the host copy of the array.
        numpy_data = isinstance(h.data, numpy.ndarray)
        rescaled = scaler.inverse_transform(cuda.to_cpu(h.data))
        if not numpy_data:
            rescaled = cuda.to_gpu(rescaled)
        return Variable(rescaled)
53+
54+
55+
def parse_arguments():
56+
# Lists of supported preprocessing methods/models.
3857
method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
3958
label_names = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2',
4059
'zpve', 'U0', 'U', 'H', 'G', 'Cv']
4160
scale_list = ['standardize', 'none']
4261

43-
parser = argparse.ArgumentParser(
44-
description='Regression with QM9.')
62+
# Set up the argument parser.
63+
parser = argparse.ArgumentParser(description='Regression on QM9.')
4564
parser.add_argument('--method', '-m', type=str, choices=method_list,
46-
default='nfp')
65+
help='method name', default='nfp')
4766
parser.add_argument('--label', '-l', type=str, choices=label_names,
48-
default='', help='target label for regression, '
49-
'empty string means to predict all '
50-
'property at once')
67+
default='',
68+
help='target label for regression; empty string means '
69+
'predicting all properties at once')
5170
parser.add_argument('--scale', type=str, choices=scale_list,
52-
default='standardize', help='Label scaling method')
53-
parser.add_argument('--batchsize', '-b', type=int, default=32)
54-
parser.add_argument('--gpu', '-g', type=int, default=-1)
55-
parser.add_argument('--in-dir', '-i', type=str, default='result')
56-
parser.add_argument('--seed', '-s', type=int, default=777)
57-
parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
58-
parser.add_argument('--model-filename', type=str, default='regressor.pkl')
71+
help='label scaling method', default='standardize')
72+
parser.add_argument('--gpu', '-g', type=int, default=-1,
73+
help='id of gpu to use; negative value means running'
74+
'the code on cpu')
75+
parser.add_argument('--seed', '-s', type=int, default=777,
76+
help='random seed value')
77+
parser.add_argument('--train-data-ratio', '-r', type=float, default=0.7,
78+
help='ratio of training data w.r.t the dataset')
79+
parser.add_argument('--in-dir', '-i', type=str, default='result',
80+
help='directory to load model data from')
81+
parser.add_argument('--model-filename', type=str, default='regressor.pkl',
82+
help='saved model filename')
5983
parser.add_argument('--num-data', type=int, default=-1,
60-
help='Number of data to be parsed from parser.'
61-
'-1 indicates to parse all data.')
62-
args = parser.parse_args()
84+
help='amount of data to be parsed; -1 indicates '
85+
'parsing all data.')
86+
return parser.parse_args()
87+
88+
6389

64-
seed = args.seed
65-
train_data_ratio = args.train_data_ratio
90+
def main():
91+
# Parse the arguments.
92+
args = parse_arguments()
93+
94+
# Set up some useful variables that will be used later on.
6695
method = args.method
6796
if args.label:
6897
labels = args.label
6998
cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
70-
# class_num = len(labels) if isinstance(labels, list) else 1
7199
else:
72100
labels = D.get_qm9_label_names()
73101
cache_dir = os.path.join('input', '{}_all'.format(method))
74-
# class_num = len(labels)
75-
76-
# Dataset preparation
77-
dataset = None
78102

103+
# Get the filename corresponding to the cached dataset, based on the amount
104+
# of data samples that need to be parsed from the original dataset.
79105
num_data = args.num_data
80106
if num_data >= 0:
81107
dataset_filename = 'data_{}.npz'.format(num_data)
82108
else:
83109
dataset_filename = 'data.npz'
84110

111+
# Load the cached dataset.
85112
dataset_cache_path = os.path.join(cache_dir, dataset_filename)
113+
114+
dataset = None
86115
if os.path.exists(dataset_cache_path):
87-
print('load from cache {}'.format(dataset_cache_path))
116+
print('Loading cached data from {}.'.format(dataset_cache_path))
88117
dataset = NumpyTupleDataset.load(dataset_cache_path)
89118
if dataset is None:
90-
print('preprocessing dataset...')
119+
print('Preprocessing dataset...')
91120
preprocessor = preprocess_method_dict[method]()
92121
dataset = D.get_qm9(preprocessor, labels=labels)
122+
123+
# Cache the newly preprocessed dataset.
93124
if not os.path.exists(cache_dir):
94125
os.mkdir(cache_dir)
95126
NumpyTupleDataset.save(dataset_cache_path, dataset)
96127

128+
# Load the standard scaler parameters, if necessary.
97129
if args.scale == 'standardize':
98-
# Standard Scaler for labels
99-
with open(os.path.join(args.in_dir, 'ss.pkl'), mode='rb') as f:
100-
ss = pickle.load(f)
130+
scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
131+
print('Loading scaler parameters from {}.'.format(scaler_path))
132+
with open(scaler_path, mode='rb') as f:
133+
scaler = pickle.load(f)
101134
else:
102-
ss = None
135+
print('No standard scaling was selected.')
136+
scaler = None
137+
138+
# Split the dataset into training and testing.
139+
train_data_size = int(len(dataset) * args.train_data_ratio)
140+
_, test = split_dataset_random(dataset, train_data_size, args.seed)
103141

104-
train_data_size = int(len(dataset) * train_data_ratio)
105-
train, test = split_dataset_random(dataset, train_data_size, seed)
142+
# Load the trained regressor from the input directory.
143+
model_path = os.path.join(args.in_dir, args.model_filename)
144+
regressor = Regressor.load_pickle(model_path, device=args.gpu)
106145

107-
regressor = Regressor.load_pickle(
108-
os.path.join(args.in_dir, args.model_filename), device=args.gpu)
146+
# Replace the default predictor with one that scales the output labels.
147+
scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
148+
scaled_predictor.scaler = scaler
149+
regressor.predictor = scaled_predictor
109150

110-
# We need to feed only input features `x` to `predict`/`predict_proba`.
111-
# This converter extracts only inputs (x1, x2, ...) from the features which
112-
# consist of input `x` and label `t` (x1, x2, ..., t).
151+
# This callback function extracts only the inputs and discards the labels.
113152
def extract_inputs(batch, device=None):
114153
return concat_mols(batch, device=device)[:-1]
115154

116-
def postprocess_fn(x):
117-
if ss is not None:
118-
# Model's output is scaled by StandardScaler,
119-
# so we need to rescale back.
120-
if isinstance(x, Variable):
121-
x = x.data
122-
scaled_x = ss.inverse_transform(cuda.to_cpu(x))
123-
return scaled_x
124-
else:
125-
return x
126-
155+
# Predict the output labels.
127156
print('Predicting...')
128-
y_pred = regressor.predict(test, converter=extract_inputs,
129-
postprocess_fn=postprocess_fn)
130-
131-
print('y_pred.shape = {}, y_pred[:5, 0] = {}'
132-
.format(y_pred.shape, y_pred[:5, 0]))
157+
y_pred = regressor.predict(test, converter=extract_inputs)
133158

159+
# Extract the ground-truth labels.
134160
t = concat_mols(test, device=-1)[-1]
135161
n_eval = 10
136162

137-
# Construct dataframe
163+
# Construct dataframe.
138164
df_dict = {}
139165
for i, l in enumerate(labels):
140-
df_dict.update({
141-
'y_pred_{}'.format(l): y_pred[:, i],
142-
't_{}'.format(l): t[:, i],
143-
})
166+
df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],
167+
't_{}'.format(l): t[:, i],})
144168
df = pandas.DataFrame(df_dict)
145169

146-
# Show random 5 example's prediction/ground truth table
170+
# Show a prediction/ground truth table with 5 random examples.
147171
print(df.sample(5))
148-
149172
for target_label in range(y_pred.shape[1]):
150173
diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
151174
print('target_label = {}, y_pred = {}, t = {}, diff = {}'
152175
.format(target_label, y_pred[:n_eval, target_label],
153176
t[:n_eval, target_label], diff))
154177

155-
# --- evaluate ---
156-
# To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
178+
# Run an evaluator on the test dataset.
157179
print('Evaluating...')
158180
test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
159-
eval_result = Evaluator(
160-
test_iterator, regressor, converter=concat_mols, device=args.gpu)()
181+
eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
182+
device=args.gpu)()
183+
184+
# Prevents the loss value from remaining a cupy.core.core.ndarray object
185+
# when using the GPU. This hack will be removed as soon as the cause of
186+
# the issue is found and properly fixed.
187+
loss = numpy.asscalar(cuda.to_cpu(eval_result['main/loss']))
188+
eval_result['main/loss'] = loss
161189
print('Evaluation result: ', eval_result)
162190

191+
# Save the evaluation results.
163192
with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
164193
json.dump(eval_result, f)
165194

examples/qm9/test_qm9.sh

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,47 @@
22

33
set -e
44

5-
# gpu id given from first argument, default value is -1
6-
gpu=${1:--1}
5+
# List of available graph convolution methods.
6+
methods=(nfp ggnn schnet weavenet rsgcn)
7+
# Number of training epochs (default: 1).
8+
epoch=${1:-1}
9+
# GPU identifier; set it to -1 to train on the CPU (default).
10+
gpu=${2:--1}
711

8-
for method in nfp ggnn schnet weavenet rsgcn
12+
for method in ${methods[@]}
913
do
10-
# QM9
11-
if [ ! -f "input" ]; then
12-
rm -rf input
13-
fi
14+
# Remove any previously cached datasets.
15+
[ -d "input" ] && rm -rf input
1416

15-
python train_qm9.py --method ${method} --label A --conv-layers 1 --gpu ${gpu} --epoch 1 --unit-num 10 --batchsize 32 --num-data 100
16-
python predict_qm9.py --method ${method} --label A --gpu ${gpu} --batchsize 32 --num-data 100
17-
python train_qm9.py --method ${method} --conv-layers 1 --gpu ${gpu} --epoch 1 --unit-num 10 --batchsize 32 --num-data 100
18-
python predict_qm9.py --method ${method} --gpu ${gpu} --batchsize 32 --num-data 100
17+
# Train with the current method (one label).
18+
python train_qm9.py \
19+
--method ${method} \
20+
--label A \
21+
--conv-layers 1 \
22+
--gpu ${gpu} \
23+
--epoch ${epoch} \
24+
--unit-num 10 \
25+
--num-data 100
26+
27+
# Predict with the current method (one label).
28+
python predict_qm9.py \
29+
--method ${method} \
30+
--label A \
31+
--gpu ${gpu} \
32+
--num-data 100
33+
34+
# Train with the current method (all labels).
35+
python train_qm9.py \
36+
--method ${method} \
37+
--conv-layers 1 \
38+
--gpu ${gpu} \
39+
--epoch ${epoch} \
40+
--unit-num 10 \
41+
--num-data 100
42+
43+
# Predict with the current method (all labels).
44+
python predict_qm9.py \
45+
--method ${method} \
46+
--gpu ${gpu} \
47+
--num-data 100
1948
done

0 commit comments

Comments
 (0)