import os
import concurrent.futures
import multiprocessing

import netCDF4
import numpy as np

from ..features.io import load_batch


def ranks_ensemble(
    observation, forecasts,
    noise_scale=1e-6, rng=None
):
    """Rank of each observation within its forecast ensemble (last axis of forecasts)."""
    shape = observation.shape
    N = np.prod(shape)
    observation = observation.reshape((N,))
    forecasts = forecasts.reshape((N, forecasts.shape[-1]))
    N_threads = multiprocessing.cpu_count()
    ranks_all = np.zeros_like(observation, dtype=np.uint32)
    if rng is None:
        rng = np.random

    def rank_dist_chunk(k):
        # each thread handles one contiguous slice of the flattened arrays
        i0 = int(round((k/N_threads) * N))
        i1 = int(round(((k+1)/N_threads) * N))
        obs = observation[i0:i1].astype(np.float64, copy=True)
        fc = forecasts[i0:i1, :].astype(np.float64, copy=True)
        # add a tiny amount of noise to randomize ties
        # (important to add to both obs and fc!)
        obs += (rng.rand(*obs.shape) - 0.5) * noise_scale
        fc += (rng.rand(*fc.shape) - 0.5) * noise_scale
        ranks = np.count_nonzero(obs[..., None] >= fc, axis=-1)
        ranks_all[i0:i1] = ranks

    with concurrent.futures.ThreadPoolExecutor(N_threads) as executor:
        futures = {}
        for k in range(N_threads):
            futures[executor.submit(rank_dist_chunk, k)] = k
        concurrent.futures.wait(futures)

    return ranks_all.reshape(shape)
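
# Minimal usage sketch for ranks_ensemble. The function works with any observation
# shape as long as forecasts carries one extra trailing ensemble-member axis; the 5D
# layout below (sample, channel, time, h, w) is an assumption inferred from the
# NetCDF dimensions written by save_ranks_for_dataset, not a requirement here.
#
#     obs = np.random.rand(4, 1, 6, 64, 64)
#     fc = np.random.rand(4, 1, 6, 64, 64, 32)
#     r = ranks_ensemble(obs, fc)   # integer ranks in 0..32, same shape as obs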

def ranks_multiscale(observation, forecasts):
    """Compute ranks at successively coarser scales using 2x2 average pooling."""
    obs = observation
    fc = forecasts
    rank_scales = {}
    scale = 1
    while True:
        r = ranks_ensemble(obs, fc)
        rank_scales[scale] = r
        scale *= 2
        if obs.shape[-1] == 1:
            break
        # 2x2 average pooling over the two spatial dimensions
        obs = 0.25 * (
            obs[..., ::2, ::2] +
            obs[..., 1::2, ::2] +
            obs[..., ::2, 1::2] +
            obs[..., 1::2, 1::2]
        )
        # forecasts carry a trailing ensemble-member axis, so pooling acts on axes -3 and -2
        fc = 0.25 * (
            fc[..., ::2, ::2, :] +
            fc[..., 1::2, ::2, :] +
            fc[..., ::2, 1::2, :] +
            fc[..., 1::2, 1::2, :]
        )
    return rank_scales
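
# ranks_multiscale returns a dict keyed by pooling scale, e.g. {1: full-resolution
# ranks, 2: ranks after one 2x2 pooling, ...} up to the scale at which the domain
# collapses to a single pixel. The loop implicitly assumes the spatial dimensions
# are powers of two so that repeated halving terminates at width 1.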

def rank_distribution(ranks, num_forecasts=32):
    """Normalized histogram of ranks over the bins 0..num_forecasts."""
    N = np.prod(ranks.shape)
    # bin edges at -0.5, 0.5, ..., num_forecasts+0.5: one bin per possible rank
    bins = np.arange(-0.5, num_forecasts+0.6)
    N_threads = multiprocessing.cpu_count()
    ranks = ranks.ravel()
    hist = [None] * N_threads

    def hist_chunk(k):
        i0 = int(round((k/N_threads) * N))
        i1 = int(round(((k+1)/N_threads) * N))
        (h, _) = np.histogram(ranks[i0:i1], bins=bins)
        hist[k] = h

    with concurrent.futures.ThreadPoolExecutor(N_threads) as executor:
        futures = {}
        for k in range(N_threads):
            futures[executor.submit(hist_chunk, k)] = k
        concurrent.futures.wait(futures)

    hist = sum(hist)
    return hist / hist.sum()

def rank_KS(rank_dist, num_forecasts=32):
    """Kolmogorov-Smirnov-type distance between the rank CDF and a uniform baseline."""
    # num_forecasts is unused here but kept so all metrics share the same signature
    h = rank_dist
    h = h / h.sum()
    ch = np.cumsum(h)
    cb = np.linspace(0, 1, len(ch))
    return abs(ch-cb).max()

def rank_DKL(rank_dist, num_forecasts=32):
    """Kullback-Leibler divergence KL(uniform || rank distribution)."""
    h = rank_dist
    q = h / h.sum()
    p = 1/len(h)
    return p*np.log(p/q).sum()
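
# Both metrics compare the rank distribution against the flat histogram expected of a
# calibrated ensemble. Quick sanity check (a sketch, not part of the original code):
#
#     h = rank_distribution(ranks, num_forecasts=32)   # 33 bins for ranks 0..32
#     rank_DKL(h)   # ~0 for a flat histogram, larger for under-/overdispersed ensembles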

def rank_metric_by_leadtime(ranks, metric=None, num_forecasts=32):
    """Evaluate a rank metric separately for each forecast lead time (axis 2 of ranks)."""
    if metric is None:
        metric = rank_DKL
    metric_by_leadtime = []
    for t in range(ranks.shape[2]):
        ranks_time = ranks[:, :, t, ...]
        h = rank_distribution(ranks_time, num_forecasts=num_forecasts)
        m = metric(h, num_forecasts=num_forecasts)
        metric_by_leadtime.append(m)
    return np.array(metric_by_leadtime)

def rank_metric_by_bin(ranks, values, bins, metric=None, num_forecasts=32):
    """Evaluate a rank metric separately for each bin of a conditioning variable."""
    if metric is None:
        metric = rank_DKL
    metric_by_bin = []
    for (b0, b1) in zip(bins[:-1], bins[1:]):
        ranks_bin = ranks[(b0 <= values) & (values < b1)]
        h = rank_distribution(ranks_bin, num_forecasts=num_forecasts)
        m = metric(h, num_forecasts=num_forecasts)
        metric_by_bin.append(m)
    return np.array(metric_by_bin)
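
# Hedged sketch of conditioning calibration on the observed value; the names y and
# ranks below are illustrative, not defined in this module:
#
#     value_bins = np.array([0.0, 0.1, 1.0, 10.0])
#     dkl_by_bin = rank_metric_by_bin(ranks, values=y, bins=value_bins)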

def process_batch(fn, preproc_fc=None):
    """Load one batch file and compute its multiscale ranks."""
    print(fn)
    (_, y, y_pred) = load_batch(fn, preproc_fc=preproc_fc)
    return ranks_multiscale(y, y_pred)

def save_ranks_for_dataset(data_dir, result_fn, preproc_fc=None):
    """Compute multiscale ranks for every batch file in data_dir and write them to a NetCDF file."""
    files = sorted(os.listdir(data_dir))
    files = [os.path.join(data_dir, fn) for fn in files]
    N_threads = multiprocessing.cpu_count()

    # process batch files in parallel across processes
    futures = []
    with concurrent.futures.ProcessPoolExecutor(N_threads) as executor:
        for fn in files:
            futures.append(
                executor.submit(process_batch, fn, preproc_fc=preproc_fc)
            )
        ranks = [f.result() for f in futures]

    # concatenate the per-batch results along the sample axis, one array per scale
    scales = sorted(ranks[0].keys())
    ranks = {
        s: np.concatenate([r[s] for r in ranks], axis=0)
        for s in scales
    }

    with netCDF4.Dataset(result_fn, 'w') as ds:
        ds.createDimension("dim_sample", ranks[1].shape[0])
        ds.createDimension("dim_channel", ranks[1].shape[1])
        ds.createDimension("dim_time_future", ranks[1].shape[2])
        var_params = {"zlib": True, "complevel": 1}
        for s in scales:
            ds.createDimension(f"dim_h_pool{s}x{s}", ranks[s].shape[3])
            ds.createDimension(f"dim_w_pool{s}x{s}", ranks[s].shape[4])
            var = ds.createVariable(
                f"ranks_pool{s}x{s}", np.float32,
                (
                    "dim_sample", "dim_channel", "dim_time_future",
                    f"dim_h_pool{s}x{s}", f"dim_w_pool{s}x{s}",
                ),
                chunksizes=(1,) + ranks[s].shape[1:],
                **var_params
            )
            var[:] = ranks[s]
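
# Hedged usage sketch: the paths below are placeholders, not paths from the original
# project, and because of the relative import above the module has to be run from its
# parent package (e.g. with "python -m ..."). It writes one compressed rank variable
# per pooling scale.
if __name__ == "__main__":
    save_ranks_for_dataset(
        "data/test_batches",   # placeholder: directory of batch files readable by load_batch
        "results/ranks.nc",    # placeholder: output NetCDF path
    )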