# utils.py — data loading, sequence padding, and training helper utilities.
import datetime
import os
from torch.utils.tensorboard import SummaryWriter

import pandas as pd
from tqdm import tqdm

# region load data

def read_links_csv(csv_path, nrows=float('inf'), chunksize=100):
    r"""
    Load a links CSV into a single DataFrame, chunk by chunk with a progress bar.

    Parameters:
    -----------
        csv_path: path to a CSV file with a header row; columns ``Y`` and ``Z``
            hold Python-literal expressions and are parsed with ``pd.eval``.
        nrows: maximum number of data rows to read (default: all rows).
        chunksize: number of rows read per chunk.

    Returns:
    --------
        pandas.DataFrame with all chunks concatenated.
    """
    print("\n" + "#" * 20)
    print("Loading csv...")

    # Count data rows up front so the progress bar has a total; a context
    # manager guarantees the handle is closed (the original leaked it).
    with open(csv_path, 'r', encoding="utf8") as f:
        rows = sum(1 for _ in f) - 1  # minus the header

    if rows > nrows:
        rows = nrows

    chunk_list = []
    with tqdm(total=rows, desc='Rows read: ') as bar:
        # BUG FIX: pass the capped integer `rows` to read_csv. The original
        # passed `nrows`, whose default float('inf') is rejected by pandas
        # (nrows must be an integer). The sibling read_supertags_csv already
        # does this correctly.
        # NOTE(review): pd.eval evaluates expressions from the file — only
        # use on trusted CSVs.
        for chunk in pd.read_csv(csv_path, header=0,
                                 converters={'Y': pd.eval, 'Z': pd.eval},
                                 chunksize=chunksize, nrows=rows):
            chunk_list.append(chunk)
            bar.update(len(chunk))

    df = pd.concat(chunk_list, axis=0)
    print("#" * 20)
    return df

def read_supertags_csv(csv_path, nrows=float('inf'), chunksize=100):
    r"""
    Load a supertags CSV into a single DataFrame, chunk by chunk with a progress bar.

    Parameters:
    -----------
        csv_path: path to a CSV file with a header row; columns ``Y1``, ``Y2``
            and ``Z`` hold Python-literal expressions and are parsed with
            ``pd.eval``.
        nrows: maximum number of data rows to read (default: all rows).
        chunksize: number of rows read per chunk.

    Returns:
    --------
        pandas.DataFrame with all chunks concatenated.
    """
    print("\n" + "#" * 20)
    print("Loading csv...")

    # Count data rows up front so the progress bar has a total; a context
    # manager guarantees the handle is closed (the original leaked it).
    with open(csv_path, 'r', encoding="utf8") as f:
        rows = sum(1 for _ in f) - 1  # minus the header

    # Cap at the requested maximum; `rows` stays an int suitable for read_csv.
    if rows > nrows:
        rows = nrows

    chunk_list = []
    with tqdm(total=rows, desc='Rows read: ') as bar:
        # NOTE(review): pd.eval evaluates expressions from the file — only
        # use on trusted CSVs.
        for chunk in pd.read_csv(csv_path, header=0,
                                 converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval},
                                 chunksize=chunksize, nrows=rows):
            chunk_list.append(chunk)
            bar.update(len(chunk))

    df = pd.concat(chunk_list, axis=0)
    print("#" * 20)
    return df


def load_obj(name):
    """Load and return the object pickled at ``<name>.pkl``."""
    import pickle
    pkl_path = name + '.pkl'
    with open(pkl_path, 'rb') as handle:
        return pickle.load(handle)

#endregion

# region format data

def pad_sequence(sequences, batch_first=True, padding_value=0, max_len=400):
    r"""
    Pad (and, if needed, truncate) sequences for preparation to a TensorDataset.

     sequences: list of tensors, all with the same trailing dimensions
     batch_first: boolean indicating whether the batch is the first dimension
     padding_value: the value used for padding
     max_len: the fixed output length along the sequence dimension
    :return: a single tensor of shape (batch, max_len, *) when batch_first,
             otherwise (max_len, batch, *)
    """
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims

    # Allocate on the same device/dtype as the input and pre-fill with padding.
    out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value)
    for i, tensor in enumerate(sequences):
        # BUG FIX: clamp to max_len — the original assigned the full tensor
        # into a max_len slot and raised RuntimeError for longer sequences;
        # now they are truncated instead.
        length = min(tensor.size(0), max_len)
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
            out_tensor[i, :length, ...] = tensor[:length]
        else:
            out_tensor[:length, i, ...] = tensor[:length]

    return out_tensor

#endregion

# region utils training

def output_create_dir():
    """
    Create the output directory for TensorBoard logs and checkpoints.

    @return: training dir path, TensorBoard SummaryWriter logging into it
    """
    from datetime import datetime
    # NOTE(review): 'Tranning_' is a misspelling kept verbatim — existing
    # tooling may depend on the directory name; fix in a coordinated change.
    base_dir = 'TensorBoard'
    stamp = datetime.today().strftime('%d-%m_%H-%M')
    training_dir = os.path.join(base_dir, 'Tranning_' + stamp)
    logs_dir = os.path.join(training_dir, 'logs')
    return training_dir, SummaryWriter(log_dir=logs_dir)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest whole second, then let timedelta do the formatting.
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))

#endregion