# utils.py — authored by Caroline de Pourtalès
import datetime
import os
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
from tqdm import tqdm
# region load data
def read_links_csv(csv_path, nrows=float('inf'), chunksize=100):
    r"""
    Load a links CSV into a DataFrame, reading it in chunks with a progress bar.

    Parameters:
    -----------
    csv_path: path to the CSV file; columns 'Y' and 'Z' contain list-like
        strings that are parsed with ``pd.eval``.
    nrows: maximum number of data rows to read (default: all rows).
    chunksize: number of rows per chunk read from disk.

    :return: pandas.DataFrame with all chunks concatenated.
    """
    print("\n" + "#" * 20)
    print("Loading csv...")
    # Count data rows up-front so the progress bar has a total; the context
    # manager guarantees the handle is closed (the original leaked it).
    with open(csv_path, 'r', encoding="utf8") as f:
        rows = sum(1 for _ in f) - 1  # minus the header
    if rows > nrows:
        rows = nrows
    chunk_list = []
    with tqdm(total=rows, desc='Rows read: ') as bar:
        # Pass the clamped row count: pd.read_csv raises ValueError on the
        # default nrows=float('inf') (it requires an integer).
        for chunk in pd.read_csv(csv_path, header=0,
                                 converters={'Y': pd.eval, 'Z': pd.eval},
                                 chunksize=chunksize, nrows=rows):
            chunk_list.append(chunk)
            bar.update(len(chunk))
    df = pd.concat(chunk_list, axis=0)
    print("#" * 20)
    return df
def read_supertags_csv(csv_path, nrows=float('inf'), chunksize=100):
    r"""
    Load a supertags CSV into a DataFrame, reading it in chunks with a progress bar.

    Parameters:
    -----------
    csv_path: path to the CSV file; columns 'Y1', 'Y2' and 'Z' contain
        list-like strings that are parsed with ``pd.eval``.
    nrows: maximum number of data rows to read (default: all rows).
    chunksize: number of rows per chunk read from disk.

    :return: pandas.DataFrame with all chunks concatenated.
    """
    print("\n" + "#" * 20)
    print("Loading csv...")
    # Count data rows up-front so the progress bar has a total; the context
    # manager guarantees the handle is closed (the original leaked it).
    with open(csv_path, 'r', encoding="utf8") as f:
        rows = sum(1 for _ in f) - 1  # minus the header
    if rows > nrows:
        rows = nrows
    chunk_list = []
    with tqdm(total=rows, desc='Rows read: ') as bar:
        for chunk in pd.read_csv(csv_path,
                                 converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval},
                                 chunksize=chunksize, nrows=rows):
            chunk_list.append(chunk)
            bar.update(len(chunk))
    df = pd.concat(chunk_list, axis=0)
    print("#" * 20)
    return df
def load_obj(name):
    """Load and return the pickled object stored at ``<name>.pkl``."""
    import pickle
    # NOTE(review): pickle.load on untrusted files can execute arbitrary
    # code — only use this on data produced by this project.
    with open(name + '.pkl', 'rb') as handle:
        return pickle.load(handle)
#endregion
# region format data
def pad_sequence(sequences, batch_first=True, padding_value=0, max_len=400):
    r"""
    Pad (and, if needed, truncate) a list of tensors to a fixed length
    in preparation for a TensorDataset.

    sequences: list of tensors; all must share trailing dimensions.
    batch_first: if True the batch is the first output dimension,
        otherwise the sequence length is first.
    padding_value: fill value for positions beyond each sequence's length.
    max_len: fixed output length along the sequence dimension.
    :return: a single tensor of shape (batch, max_len, ...) or
        (max_len, batch, ...) depending on ``batch_first``.
    """
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims
    # new(*dims) allocates on the same device/dtype as the input tensors.
    out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value)
    for i, tensor in enumerate(sequences):
        # Clamp to max_len: sequences longer than max_len are truncated
        # instead of raising a size-mismatch error on the slice assignment.
        length = min(tensor.size(0), max_len)
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
            out_tensor[i, :length, ...] = tensor[:length]
        else:
            out_tensor[:length, i, ...] = tensor[:length]
    return out_tensor
#endregion
# region utils training
def output_create_dir():
    """
    Create the output directory for TensorBoard logs and checkpoints.

    The directory name is timestamped (day-month_hour-minute) so each
    training run gets its own folder under 'TensorBoard/'.
    @return: training directory path, tensorboard writer
    """
    from datetime import datetime
    stamp = datetime.today().strftime('%d-%m_%H-%M')
    # NOTE(review): 'Tranning_' looks like a typo for 'Training_' but is a
    # runtime path component — kept as-is to avoid breaking consumers.
    training_dir = os.path.join('TensorBoard', 'Tranning_' + stamp)
    writer = SummaryWriter(log_dir=os.path.join(training_dir, 'logs'))
    return training_dir, writer
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest whole second, then let timedelta do the
    # hh:mm:ss formatting via its str() representation.
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))
#endregion