📚 The CoCalc Library - books, templates and other resources
License: OTHER
import os
import numpy as np
import pandas as pd
from glob import glob
import tarfile
import urllib.request
import zipfile

here = os.path.dirname(__file__)

# All generated/downloaded data lives in the 'data' directory next to this file.
data_dir = os.path.abspath(os.path.join(here, 'data'))
if not os.path.exists(data_dir):
    raise OSError('data/ directory not found, aborting data preparation. '
                  'Restore it with "git checkout data" from the base '
                  'directory.')


def flights():
    """Download, extract and convert the NYC Flights dataset.

    Each of the three steps is skipped when its output already exists:

    1. download ``nycflights.tar.gz`` into ``data_dir``;
    2. extract it into ``data_dir`` (producing ``nycflights/``);
    3. write the first 10000 rows of every extracted CSV as line-delimited
       JSON into ``flightjson/``.
    """
    flights_raw = os.path.join(data_dir, 'nycflights.tar.gz')
    flightdir = os.path.join(data_dir, 'nycflights')
    jsondir = os.path.join(data_dir, 'flightjson')

    if not os.path.exists(flights_raw):
        print("- Downloading NYC Flights dataset... ", end='', flush=True)
        url = "https://storage.googleapis.com/dask-tutorial-data/nycflights.tar.gz"
        urllib.request.urlretrieve(url, flights_raw)
        print("done", flush=True)

    if not os.path.exists(flightdir):
        print("- Extracting flight data... ", end='', flush=True)
        # Extract into data_dir (not the cwd-relative 'data/') so the result
        # lands where the checks above and the glob below look for it,
        # regardless of the directory the script is launched from.
        # NOTE(review): the archive is extracted without member filtering;
        # this trusts the download source.
        with tarfile.open(flights_raw, mode='r:gz') as archive:
            archive.extractall(data_dir)
        print("done", flush=True)

    if not os.path.exists(jsondir):
        print("- Creating json data... ", end='', flush=True)
        os.mkdir(jsondir)
        for path in glob(os.path.join(flightdir, '*.csv')):
            prefix = os.path.splitext(os.path.basename(path))[0]
            # Just take the first 10000 rows for the demo
            df = pd.read_csv(path).iloc[:10000]
            df.to_json(os.path.join(jsondir, prefix + '.json'),
                       orient='records', lines=True)
        print("done", flush=True)

    print("** Finished! **")


def random_array():
    """Create ``random.hdf5`` holding 1e9 exponentially distributed float32s.

    Does nothing if the file already exists. The dataset ``/x`` is filled in
    slices of one million values so the full array is never held in memory.
    """
    target = os.path.join(data_dir, 'random.hdf5')
    if os.path.exists(target):
        return

    print("Create random data for array exercise")
    import h5py

    total = 1000000000
    step = 1000000

    # Explicit write mode: h5py >= 3 opens read-only by default, which would
    # fail here because the file does not exist yet.
    with h5py.File(target, mode='w') as f:
        dset = f.create_dataset('/x', shape=(total,), dtype='f4')
        for start in range(0, total, step):
            dset[start: start + step] = np.random.exponential(size=step)


def accounts_csvs(num_files, n, k):
    """Write ``accounts.<i>.csv`` files for ``i`` in ``range(num_files)``.

    Skipped entirely when the last file of the series already exists.
    ``k`` is forwarded to ``accounts.account_params`` and ``n`` to
    ``accounts.account_entries``; their exact meaning is defined in the
    ``accounts`` module (presumably number of accounts and number of
    entries per file -- verify there).
    """
    from accounts import account_entries, account_params
    last_fn = os.path.join(data_dir, 'accounts.%d.csv' % (num_files - 1))

    if os.path.exists(last_fn):
        return

    print("Create CSV accounts for dataframe exercise")

    args = account_params(k)

    for i in range(num_files):
        df = account_entries(n, *args)
        df.to_csv(os.path.join(data_dir, 'accounts.%d.csv' % i),
                  index=False)


def accounts_json(num_files, n, k):
    """Write gzipped JSON files ``accounts.<ii>.json.gz``, one record per line.

    Skipped entirely when the last file of the series already exists.
    ``k`` is forwarded to ``accounts.account_params`` and ``n`` to
    ``accounts.json_entries``; see the ``accounts`` module for their meaning.
    """
    from accounts import account_params, json_entries
    import json
    import gzip
    last_fn = os.path.join(data_dir, 'accounts.%02d.json.gz' % (num_files - 1))
    if os.path.exists(last_fn):
        return

    print("Create JSON accounts for bag exercise")

    args = account_params(k)

    for i in range(num_files):
        seq = json_entries(n, *args)
        out_fn = os.path.join(data_dir, 'accounts.%02d.json.gz' % i)
        with gzip.open(out_fn, 'wb') as f:
            f.write(os.linesep.join(map(json.dumps, seq)).encode())


def create_weather(growth=32):
    """Upscale every ``weather-small/*.hdf5`` file into ``weather-big/``.

    The ``/t2m`` dataset of each input is resized by ``growth`` along both
    of its first two axes and written to a file of the same name in
    ``weather-big``. Outputs that already exist are left untouched.

    Raises:
        ValueError: if ``weather-small`` contains no ``.hdf5`` files.
    """
    small_dir = os.path.join(data_dir, 'weather-small')
    big_dir = os.path.join(data_dir, 'weather-big')
    filenames = sorted(glob(os.path.join(small_dir, '*.hdf5')))

    if not filenames:
        raise ValueError('Did not find any hdf5 files in {}'.format(small_dir))

    if not os.path.exists(big_dir):
        os.mkdir(big_dir)

    # Pair each input with the exact path that will be written, instead of
    # str.replace('small', 'big') on the whole path, which also rewrites any
    # other 'small' occurring in a directory or file name.
    jobs = [(fn, os.path.join(big_dir, os.path.basename(fn)))
            for fn in filenames]

    if all(os.path.exists(out_fn) for _, out_fn in jobs):
        return

    from skimage.transform import resize
    import h5py

    print('Exploding weather data')
    for fn, out_fn in jobs:
        if os.path.exists(out_fn):
            continue

        with h5py.File(fn, mode='r') as f:
            x = f['/t2m'][:]

        y = resize(x, (x.shape[0] * growth, x.shape[1] * growth),
                   mode='constant')

        # Stay best-effort (one bad file must not abort the rest) but report
        # the failure instead of hiding it.
        try:
            with h5py.File(out_fn, mode='w') as f:
                f.create_dataset('/t2m', data=y, chunks=(500, 500))
        except (OSError, ValueError) as exc:
            print('  could not write {}: {}'.format(out_fn, exc))
            # Drop a partial output so a later run retries this file.
            if os.path.exists(out_fn):
                os.remove(out_fn)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Downloads, generates and prepares data for the Dask tutorial.')
    parser.add_argument('--no-ssl-verify', dest='no_ssl_verify', action='store_true',
                        default=False, help='Disables SSL verification.')

    args = parser.parse_args()

    if args.no_ssl_verify:
        print("- Disabling SSL Verification... ", end='', flush=True)
        import ssl
        # Relies on private ssl hooks to make urllib skip certificate checks.
        ssl._create_default_https_context = ssl._create_unverified_context
        print("done", flush=True)

    random_array()
    create_weather()
    accounts_csvs(3, 1000000, 500)
    accounts_json(50, 100000, 500)
    flights()