# TODO: add this to documentation
"""
Annealing algorithm for hyperopt
Annealing is a simple but effective variant on random search that
takes some advantage of a smooth response surface.
The simple (but not overly simple) code of simulated annealing makes this file
a good starting point for implementing new search algorithms.
"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
from builtins import zip
from past.utils import old_div
import logging
import numpy as np
from hyperopt.pyll.base import bincount
from .pyll.stochastic import (
categorical,
normal,
lognormal,
qnormal,
qlognormal,
uniform,
loguniform,
quniform,
qloguniform,
)
from .base import miscs_to_idxs_vals
from .algobase import SuggestAlgo, ExprEvaluator
__authors__ = "James Bergstra"
__license__ = "3-clause BSD License"
__contact__ = "github.com/hyperopt/hyperopt"
logger = logging.getLogger(__name__)
class AnnealingAlgo(SuggestAlgo):
"""
This simple annealing algorithm begins by sampling from the prior,
but tends over time to sample from points closer and closer to the best
ones observed.
In addition to the value of this algorithm as a baseline optimization
strategy, it is a simple starting point for implementing new algorithms.
# The Annealing Algorithm
The annealing algorithm chooses one of the previous trial points as a
starting point, and then samples each hyperparameter from a distribution
similar to the one specified in the prior, but whose density is more
concentrated around the selected trial point.
This algorithm is a simple variation on random search that leverages
smoothness in the response surface. The annealing rate is not adaptive.
## Choosing a Best Trial
The algorithm formalizes the notion of "one of the best trials" by
sampling a position from a geometric distribution whose mean is the
`avg_best_idx` parameter. The "best trial" is the trial thus selected
from the set of all trials (`self.trials`).
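For illustration, with the default `avg_best_idx=2.0` the position is drawn
as `rng.geometric(1.0 / avg_best_idx) - 1`, so the single best trial
(position 0) is chosen with probability 0.5, the runner-up with probability
0.25, and so on.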
It can happen that, during ancestral sampling, the best trial at some
ancestral point did not use the hyperparameter we need to draw. In such a
case, this algorithm draws a new "runner-up" best trial and uses that one
as if it had been chosen as the best trial.
The set of best trials and runner-up best trials obtained while choosing
all the hyperparameters is kept sorted by validation loss, and at each
point where the best trial does not define a required hyperparameter value,
we go through the whole list of runners-up before giving up and adding a
new runner-up trial.
## Concentrating Prior Distributions
To sample a hyperparameter X within a search space, we look at
what kind of hyperparameter it is (what kind of distribution it's from)
and the previous successful values of that hyperparameter, and make
a new proposal for that hyperparameter independently of other
hyperparameters (except technically any choice nodes that led us to use
this current hyperparameter in the first place).
For example, if X is a uniformly-distributed hyperparameter drawn from
`U(l, h)`, we look at the value `x` of that hyperparameter in the selected
trial, and draw from a new uniform density `U(x - w/2, x + w/2)`, where w
depends on the initial range and on the number of observations we have for
X so far. If W is the initial range and T is the number of observations,
then w = W / (1 + T * shrink_coef). If the resulting range would extend
either below l or above h, we shift it to fit within the original bounds.
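For a concrete illustration: with l=0, h=10, shrink_coef=0.1 and T=5 prior
observations, w = 10 / (1 + 0.5) ≈ 6.67; if the selected trial had x = 9,
the proposed window would extend above h, so it is shifted to roughly
[3.33, 10].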
"""
def __init__(self, domain, trials, seed, avg_best_idx=2.0, shrink_coef=0.1):
"""
Parameters
----------
avg_best_idx: float
Mean of geometric distribution over which trial to explore around,
selecting from trials sorted by score (0 is best)
shrink_coef: float
Rate of reduction in the size of sampling neighborhood as more
points have been explored.
"""
SuggestAlgo.__init__(self, domain, trials, seed=seed)
self.avg_best_idx = avg_best_idx
self.shrink_coef = shrink_coef
doc_by_tid = {}
for doc in trials.trials:
# -- get either this doc's own tid or the one that it's from
tid = doc["tid"]
loss = domain.loss(doc["result"], doc["spec"])
if loss is None:
# -- associate infinite loss to new/running/failed jobs
loss = float("inf")
else:
loss = float(loss)
doc_by_tid[tid] = (doc, loss)
self.tid_docs_losses = sorted(doc_by_tid.items())
self.tids = np.asarray([t for (t, (d, l)) in self.tid_docs_losses])
self.losses = np.asarray([l for (t, (d, l)) in self.tid_docs_losses])
self.tid_losses_dct = dict(list(zip(self.tids, self.losses)))
# node_tids: dict from hp label -> trial ids (tids) using that hyperparam
# node_vals: dict from hp label -> values taken by that hyperparam
self.node_tids, self.node_vals = miscs_to_idxs_vals(
[d["misc"] for (tid, (d, l)) in self.tid_docs_losses],
keys=list(domain.params.keys()),
)
self.best_tids = []
def shrinking(self, label):
"""Return fraction of original search width
Parameters
----------
label: string
the name of a hyperparameter
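Notes
-----
With T prior observations of this hyperparameter and the default
shrink_coef of 0.1, the returned fraction is 1 / (1 + 0.1 * T),
e.g. about 0.67 after five observations.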
"""
T = len(self.node_vals[label])
return old_div(1.0, (1.0 + T * self.shrink_coef))
def choose_ltv(self, label, size):
"""Returns (loss, tid, val) of best/runner-up trial
"""
tids = self.node_tids[label]
vals = self.node_vals[label]
losses = [self.tid_losses_dct[tid] for tid in tids]
if size == 1:
# -- try to return the value corresponding to one of the
# trials that was previously chosen (non-independence
# of hyperparameter values)
# This doesn't really make sense if we're sampling a lot of
# points at a time.
tid_set = set(tids)
for tid in self.best_tids:
if tid in tid_set:
idx = tids.index(tid)
rval = losses[idx], tid, vals[idx]
return rval
# -- choose a new good seed point
good_idx = self.rng.geometric(old_div(1.0, self.avg_best_idx), size=size) - 1
good_idx = np.clip(good_idx, 0, len(tids) - 1).astype("int32")
picks = np.argsort(losses)[good_idx]
picks_loss = np.asarray(losses)[picks]
picks_tids = np.asarray(tids)[picks]
picks_vals = np.asarray(vals)[picks]
if size == 1:
self.best_tids.append(int(picks_tids))
return picks_loss, picks_tids, picks_vals
def on_node_hyperparameter(self, memo, node, label):
"""
Return a new value for one hyperparameter.
Parameters
----------
memo - a partially-filled dictionary of node -> list-of-values
for the nodes in a vectorized representation of the
original search space.
node - an Apply instance in the vectorized search space,
which corresponds to a hyperparameter
label - a string, the name of the hyperparameter
Returns: a list with one value in it: the suggested value for this
hyperparameter
Notes
-----
This function works by delegating to self.hp_HPTYPE functions to
handle each of the kinds of hyperparameters in hyperopt.pyll_utils.
Other search algorithms can implement this function without
delegating based on the hyperparameter type, but it's a pattern
I've used a few times so I show it here.
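For example, a pyll node named "uniform" is dispatched to `self.hp_uniform`
and one named "randint" to `self.hp_randint`; if no matching `hp_<name>`
method exists, a NotImplementedError is raised.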
"""
n_observations = len(self.node_vals[label])
if n_observations > 0:
# -- Pick a previous trial on which to base the new sample
size = memo[node.arg["size"]]
loss, tid, val = self.choose_ltv(label, size=size)
try:
handler = getattr(self, "hp_%s" % node.name)
except AttributeError:
raise NotImplementedError("Annealing", node.name)
return handler(memo, node, label, tid, val)
else:
# -- Draw the new sample from the prior
return ExprEvaluator.on_node(self, memo, node)
def hp_uniform(
self,
memo,
node,
label,
tid,
val,
log_scale=False,
pass_q=False,
uniform_like=uniform,
):
"""
Return a new value for a uniform hyperparameter.
Parameters
----------
memo - (see on_node_hyperparameter)
node - (see on_node_hyperparameter)
label - (see on_node_hyperparameter)
tid - trial-identifier of the model trial on which to base a new sample
val - the value of this hyperparameter on the model trial
Returns: a list with one value in it: the suggested value for this
hyperparameter
"""
if log_scale:
midpt = np.log(val)
else:
midpt = val
high = memo[node.arg["high"]]
low = memo[node.arg["low"]]
width = (high - low) * self.shrinking(label)
half = 0.5 * width
min_midpt = low + half
max_midpt = high - half
clipped_midpt = np.clip(midpt, min_midpt, max_midpt)
if pass_q:
return uniform_like(
low=clipped_midpt - half,
high=clipped_midpt + half,
rng=self.rng,
q=memo[node.arg["q"]],
size=memo[node.arg["size"]],
)
else:
return uniform_like(
low=clipped_midpt - half,
high=clipped_midpt + half,
rng=self.rng,
size=memo[node.arg["size"]],
)
def hp_quniform(self, *args, **kwargs):
return self.hp_uniform(pass_q=True, uniform_like=quniform, *args, **kwargs)
def hp_loguniform(self, *args, **kwargs):
return self.hp_uniform(
log_scale=True, pass_q=False, uniform_like=loguniform, *args, **kwargs
)
def hp_qloguniform(self, *args, **kwargs):
return self.hp_uniform(
log_scale=True, pass_q=True, uniform_like=qloguniform, *args, **kwargs
)
def hp_randint(self, memo, node, label, tid, val):
"""
Parameters: See `hp_uniform`
"""
low = memo[node.arg["low"]]
high = memo.get(node.arg["high"])
# if high is None, the domain is [0, low), else it is [low, high)
domain_size = low if high is None else high - low
offset = 0 if high is None else low
val1 = np.atleast_1d(val)
if val1.size:
counts = old_div(
bincount(val1, offset=offset, minlength=domain_size), float(val1.size)
)
else:
counts = np.zeros(domain_size)
prior = self.shrinking(label)
p = (1 - prior) * counts + prior * (old_div(1.0, domain_size))
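# -- `p` mixes the empirical distribution of previously observed values
#    (`counts`) with a uniform distribution over the domain; as more
#    observations accumulate, `prior` shrinks toward zero and the draw
#    concentrates on values that have been tried before.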
rval = categorical(p=p, rng=self.rng, size=memo[node.arg["size"]]) + offset
return rval
def hp_categorical(self, memo, node, label, tid, val):
"""
Parameters: See `hp_uniform`
"""
size = memo[node.arg["size"]]
if size == 0:
return []
val1 = np.atleast_1d(val)
p = p_orig = np.asarray(memo[node.arg["p"]])
if p.ndim == 2:
if len(p) not in (1, len(val1)):
print(node)
print(p)
print(np.asarray(p).shape)
assert len(p) in (1, len(val1))
else:
assert p.ndim == 1
p = p[np.newaxis, :]
if val1.size:
counts = old_div(np.bincount(val1, minlength=p.size), float(val1.size))
prior = self.shrinking(label)
else:
counts = np.zeros(p.size)
prior = 1.0
new_p = (1 - prior) * counts + prior * p
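# -- as in hp_randint, the new probabilities mix the observed value counts
#    with the prior `p`, giving the prior less weight as observations grow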
assert new_p.ndim == 2
rval = categorical(p=new_p, rng=self.rng, size=size)
if p_orig.ndim == 1:
assert len(rval) == 1
return rval[0]
else:
return rval
def hp_normal(self, memo, node, label, tid, val):
"""
Parameters: See `hp_uniform`
"""
return normal(
mu=val,
sigma=memo[node.arg["sigma"]] * self.shrinking(label),
rng=self.rng,
size=memo[node.arg["size"]],
)
def hp_lognormal(self, memo, node, label, tid, val):
"""
Parameters: See `hp_uniform`
"""
return lognormal(
mu=np.log(val),
sigma=memo[node.arg["sigma"]] * self.shrinking(label),
rng=self.rng,
size=memo[node.arg["size"]],
)
def hp_qlognormal(self, memo, node, label, tid, val):
"""
Parameters: See `hp_uniform`
"""
return qlognormal(
# -- prevent log(0) without messing up algo
mu=np.log(1e-16 + val),
sigma=memo[node.arg["sigma"]] * self.shrinking(label),
q=memo[node.arg["q"]],
rng=self.rng,
size=memo[node.arg["size"]],
)
def hp_qnormal(self, memo, node, label, tid, val):
"""
Parameters: See `hp_uniform`
"""
return qnormal(
mu=val,
sigma=memo[node.arg["sigma"]] * self.shrinking(label),
q=memo[node.arg["q"]],
rng=self.rng,
size=memo[node.arg["size"]],
)
def suggest(new_ids, domain, trials, seed, *args, **kwargs):
(new_id,) = new_ids
return AnnealingAlgo(domain, trials, seed, *args, **kwargs)(new_id)
def suggest_batch(new_ids, domain, trials, seed, *args, **kwargs):
return AnnealingAlgo(domain, trials, seed, *args, **kwargs).batch(new_ids)
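# -- Example usage (a minimal sketch; the objective function and search space
#    below are purely illustrative):
#
#        from hyperopt import fmin, hp
#        from hyperopt import anneal
#
#        best = fmin(
#            fn=lambda x: (x - 3.0) ** 2,
#            space=hp.uniform("x", -10, 10),
#            algo=anneal.suggest,
#            max_evals=100,
#        )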
# -- flake-8 abhors blank line EOF