 #!/usr/bin/env python
-
 from __future__ import print_function
+
 import argparse
 import json
+import numpy
 import os
+import pandas
 import pickle
 
 from chainer.iterators import SerialIterator
 from chainer.training.extensions import Evaluator
-import pandas
 
 try:
     import matplotlib
@@ ... @@
 from chainer import cuda
 from chainer.datasets import split_dataset_random
 from chainer import Variable
-import numpy  # NOQA
 
 from chainer_chemistry.dataset.converters import concat_mols
 from chainer_chemistry.dataset.preprocessors import preprocess_method_dict
@@ ... @@
 from train_qm9 import MeanAbsError, RootMeanSqrError  # NOQA
 
 
-def main():
-    # Supported preprocessing/network list
+class ScaledGraphConvPredictor(GraphConvPredictor):
+    def __init__(self, *args, **kwargs):
+        """Initializes the (scaled) graph convolution predictor. This uses
+        a standard scaler to rescale the predicted labels.
+        """
+        super(ScaledGraphConvPredictor, self).__init__(*args, **kwargs)
+
+    def __call__(self, atoms, adjs):
+        h = super(ScaledGraphConvPredictor, self).__call__(atoms, adjs)
+        # The scaler is None when no label scaling was selected; in that
+        # case the raw network output is returned unchanged.
+        scaler_available = hasattr(self, 'scaler') and self.scaler is not None
+        numpy_data = isinstance(h.data, numpy.ndarray)
+
+        if scaler_available:
+            # inverse_transform runs on the CPU, so move the data there
+            # and back to the GPU afterwards if that is where it came from.
+            h = self.scaler.inverse_transform(cuda.to_cpu(h.data))
+            if not numpy_data:
+                h = cuda.to_gpu(h)
+            h = Variable(h)
+        return h
+
+
+def parse_arguments():
+    # Lists of supported preprocessing methods/models.
     method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
     label_names = ['A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2',
                    'zpve', 'U0', 'U', 'H', 'G', 'Cv']
     scale_list = ['standardize', 'none']
 
-    parser = argparse.ArgumentParser(
-        description='Regression with QM9.')
+    # Set up the argument parser.
+    parser = argparse.ArgumentParser(description='Regression on QM9.')
     parser.add_argument('--method', '-m', type=str, choices=method_list,
-                        default='nfp')
+                        help='method name', default='nfp')
     parser.add_argument('--label', '-l', type=str, choices=label_names,
-                        default='', help='target label for regression, '
-                        'empty string means to predict all '
-                        'property at once')
+                        default='',
+                        help='target label for regression; empty string means '
+                        'predicting all properties at once')
     parser.add_argument('--scale', type=str, choices=scale_list,
-                        default='standardize', help='Label scaling method')
-    parser.add_argument('--batchsize', '-b', type=int, default=32)
-    parser.add_argument('--gpu', '-g', type=int, default=-1)
-    parser.add_argument('--in-dir', '-i', type=str, default='result')
-    parser.add_argument('--seed', '-s', type=int, default=777)
-    parser.add_argument('--train-data-ratio', '-t', type=float, default=0.7)
-    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
+                        help='label scaling method', default='standardize')
+    parser.add_argument('--gpu', '-g', type=int, default=-1,
+                        help='id of gpu to use; a negative value means '
+                        'running the code on cpu')
+    parser.add_argument('--seed', '-s', type=int, default=777,
+                        help='random seed value')
+    parser.add_argument('--train-data-ratio', '-r', type=float, default=0.7,
+                        help='ratio of training data w.r.t. the dataset')
+    parser.add_argument('--in-dir', '-i', type=str, default='result',
+                        help='directory to load model data from')
+    parser.add_argument('--model-filename', type=str, default='regressor.pkl',
+                        help='saved model filename')
     parser.add_argument('--num-data', type=int, default=-1,
-                        help='Number of data to be parsed from parser.'
-                        '-1 indicates to parse all data.')
-    args = parser.parse_args()
+                        help='amount of data to be parsed; -1 indicates '
+                        'parsing all data.')
+    return parser.parse_args()
+
+
 
-    seed = args.seed
-    train_data_ratio = args.train_data_ratio
+def main():
+    # Parse the arguments.
+    args = parse_arguments()
+
+    # Set up some useful variables that will be used later on.
     method = args.method
     if args.label:
         labels = args.label
         cache_dir = os.path.join('input', '{}_{}'.format(method, labels))
-        # class_num = len(labels) if isinstance(labels, list) else 1
     else:
         labels = D.get_qm9_label_names()
         cache_dir = os.path.join('input', '{}_all'.format(method))
-        # class_num = len(labels)
-
-    # Dataset preparation
-    dataset = None
 
+    # Get the filename corresponding to the cached dataset, based on the
+    # number of data samples that need to be parsed from the original
+    # dataset.
     num_data = args.num_data
     if num_data >= 0:
         dataset_filename = 'data_{}.npz'.format(num_data)
     else:
         dataset_filename = 'data.npz'
 
+    # Load the cached dataset.
     dataset_cache_path = os.path.join(cache_dir, dataset_filename)
+
+    dataset = None
     if os.path.exists(dataset_cache_path):
-        print('load from cache {}'.format(dataset_cache_path))
+        print('Loading cached data from {}.'.format(dataset_cache_path))
         dataset = NumpyTupleDataset.load(dataset_cache_path)
     if dataset is None:
-        print('preprocessing dataset...')
+        print('Preprocessing dataset...')
         preprocessor = preprocess_method_dict[method]()
         dataset = D.get_qm9(preprocessor, labels=labels)
+
+        # Cache the newly preprocessed dataset.
         if not os.path.exists(cache_dir):
             os.mkdir(cache_dir)
         NumpyTupleDataset.save(dataset_cache_path, dataset)
 
+    # Load the standard scaler parameters, if necessary.
     if args.scale == 'standardize':
-        # Standard Scaler for labels
-        with open(os.path.join(args.in_dir, 'ss.pkl'), mode='rb') as f:
-            ss = pickle.load(f)
+        scaler_path = os.path.join(args.in_dir, 'scaler.pkl')
+        print('Loading scaler parameters from {}.'.format(scaler_path))
+        with open(scaler_path, mode='rb') as f:
+            scaler = pickle.load(f)
     else:
-        ss = None
+        print('No standard scaling was selected.')
+        scaler = None
+
+    # Split the dataset into training and testing.
+    train_data_size = int(len(dataset) * args.train_data_ratio)
+    _, test = split_dataset_random(dataset, train_data_size, args.seed)
 
-    train_data_size = int(len(dataset) * train_data_ratio)
-    train, test = split_dataset_random(dataset, train_data_size, seed)
+    # Load the saved regressor from the input directory.
+    model_path = os.path.join(args.in_dir, args.model_filename)
+    regressor = Regressor.load_pickle(model_path, device=args.gpu)
 
-    regressor = Regressor.load_pickle(
-        os.path.join(args.in_dir, args.model_filename), device=args.gpu)
+    # Replace the default predictor with one that scales the output labels.
+    scaled_predictor = ScaledGraphConvPredictor(regressor.predictor)
+    scaled_predictor.scaler = scaler
+    regressor.predictor = scaled_predictor
 
-    # We need to feed only input features `x` to `predict`/`predict_proba`.
-    # This converter extracts only inputs (x1, x2, ...) from the features which
-    # consist of input `x` and label `t` (x1, x2, ..., t).
+    # This callback function extracts only the inputs and discards the labels.
     def extract_inputs(batch, device=None):
         return concat_mols(batch, device=device)[:-1]
 
-    def postprocess_fn(x):
-        if ss is not None:
-            # Model's output is scaled by StandardScaler,
-            # so we need to rescale back.
-            if isinstance(x, Variable):
-                x = x.data
-            scaled_x = ss.inverse_transform(cuda.to_cpu(x))
-            return scaled_x
-        else:
-            return x
-
+    # Predict the output labels.
     print('Predicting...')
-    y_pred = regressor.predict(test, converter=extract_inputs,
-                               postprocess_fn=postprocess_fn)
-
-    print('y_pred.shape = {}, y_pred[:5, 0] = {}'
-          .format(y_pred.shape, y_pred[:5, 0]))
+    y_pred = regressor.predict(test, converter=extract_inputs)
 
+    # Extract the ground-truth labels.
     t = concat_mols(test, device=-1)[-1]
     n_eval = 10
 
-    # Construct dataframe
+    # Construct the dataframe.
     df_dict = {}
     for i, l in enumerate(labels):
-        df_dict.update({
-            'y_pred_{}'.format(l): y_pred[:, i],
-            't_{}'.format(l): t[:, i],
-        })
+        df_dict.update({'y_pred_{}'.format(l): y_pred[:, i],
+                        't_{}'.format(l): t[:, i]})
     df = pandas.DataFrame(df_dict)
 
-    # Show random 5 example's prediction/ground truth table
+    # Show a prediction/ground truth table with 5 random examples.
     print(df.sample(5))
-
     for target_label in range(y_pred.shape[1]):
         diff = y_pred[:n_eval, target_label] - t[:n_eval, target_label]
         print('target_label = {}, y_pred = {}, t = {}, diff = {}'
               .format(target_label, y_pred[:n_eval, target_label],
                       t[:n_eval, target_label], diff))
 
-    # --- evaluate ---
-    # To calc loss/accuracy, we can use `Evaluator`, `ROCAUCEvaluator`
+    # Run an evaluator on the test dataset.
     print('Evaluating...')
     test_iterator = SerialIterator(test, 16, repeat=False, shuffle=False)
-    eval_result = Evaluator(
-        test_iterator, regressor, converter=concat_mols, device=args.gpu)()
+    eval_result = Evaluator(test_iterator, regressor, converter=concat_mols,
+                            device=args.gpu)()
+
+    # Prevent the loss value from staying a cupy.core.core.ndarray object
+    # when using the GPU, since json.dump below can only serialize native
+    # Python scalars. This hack will be removed as soon as the cause of the
+    # issue is found and properly fixed.
+    loss = numpy.asscalar(cuda.to_cpu(eval_result['main/loss']))
+    eval_result['main/loss'] = loss
     print('Evaluation result: ', eval_result)
 
+    # Save the evaluation results.
     with open(os.path.join(args.in_dir, 'eval_result.json'), 'w') as f:
         json.dump(eval_result, f)
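For context on the scaling logic above, here is a minimal, self-contained sketch of the round trip that ScaledGraphConvPredictor performs. It assumes the pickled scaler.pkl is an sklearn StandardScaler fitted on the raw training labels; the fitting step and the toy values are illustrative assumptions, not part of the diff.

import numpy
from sklearn.preprocessing import StandardScaler

# Toy targets standing in for the QM9 labels (made-up values).
raw_labels = numpy.array([[1.0], [3.0], [5.0]])

scaler = StandardScaler()
scaled_labels = scaler.fit_transform(raw_labels)  # what the model trains on

# Predictions made in scaled space must be mapped back to label units before
# being compared with the ground truth, which is what inverse_transform in
# ScaledGraphConvPredictor.__call__ does.
prediction_scaled = scaled_labels[1:2]            # stand-in model output
prediction = scaler.inverse_transform(prediction_scaled)
print(prediction)                                 # [[3.]] in original units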
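Similarly, the numpy.asscalar hack before json.dump can be reproduced in isolation: json cannot serialize numpy (or cupy) scalar types such as float32, so the loss has to be converted to a native Python scalar first. A small demonstration, independent of Chainer:

import json
import numpy

loss = numpy.float32(0.25)  # the dtype a Chainer loss typically has
try:
    json.dumps({'main/loss': loss})
except TypeError as error:
    print('not JSON serializable:', error)

# Converting to a native Python scalar first (numpy.asscalar in the diff;
# plain float() or .item() on newer numpy versions) makes the dump succeed.
print(json.dumps({'main/loss': float(loss)}))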