#!/usr/bin/env python
#-*- coding:utf-8 -*-
##
## data.py
##
##  Created on: Sep 20, 2017
##      Author: Alexey Ignatiev, Nina Narodytska
##      E-mail: aignatiev@ciencias.ulisboa.pt, narodytska@vmware.com
##

#
#==============================================================================
from __future__ import print_function
import collections
import itertools
import pickle
import six
import gzip
from six.moves import range
import numpy as np
import pandas as pd


#
#==============================================================================
class Data(object):
    """
        Class for representing data (transactions).

        The data set is given as one string of separator-delimited text:
        the first line is a header of column names and every following
        line is a sample. The last column is treated as the target and all
        preceding columns as features.

        Attributes filled in by parse():
            names      -- column names from the header (quotes removed)
            nm2id      -- column name -> column index
            nonbin2bin -- name prefix (text before the last ':') -> list of
                          column names sharing that prefix
            feats      -- per feature column, the set of non-empty values
            targets    -- set of values seen in the last column
            samples    -- list of distinct rows, each a list of strings
            nof_feats  -- number of feature columns
    """

    def __init__(self, data, separator=','):
        """
            Constructor and parser.

            data      -- full text of the data set (header + samples)
            separator -- column delimiter, ',' by default
        """

        self.names = None
        self.nm2id = None
        self.nonbin2bin = None
        self.feats = None
        self.targets = None
        self.samples = None
        self.nof_feats = None

        self.parse(data, separator)

    def parse(self, data, separator):
        """
            Parse the textual data set and populate the object.

            Blank lines (e.g. the one produced by a trailing newline) are
            ignored. Rows that are textually identical are stored once, in
            order of first appearance.

            Raises ValueError if a non-blank row has an empty target value.
            NOTE: a row with more feature columns than the header still
            raises IndexError, as before.
        """

        # reading data set from the input string
        lines = data.split('\n')

        # reading preamble
        header = lines[0].strip().split(separator)
        self.names = [name.replace('"', '').strip() for name in header]
        self.feats = [set() for _ in self.names[:-1]]
        self.targets = set()
        self.nof_feats = len(self.feats)

        # filling name to id mapping
        self.nm2id = {name: i for i, name in enumerate(self.names)}

        # grouping column names by the part preceding the last ':'
        self.nonbin2bin = {}
        for name in self.nm2id:
            prefix = name.rsplit(':', 1)[0]
            self.nonbin2bin.setdefault(prefix, []).append(name)

        # reading training samples; dict.fromkeys() drops repeated lines
        # while keeping the order of their first occurrence
        self.samples = []
        for line in dict.fromkeys(lines[1:]):
            row = line.strip()
            if not row:
                # nothing to parse, e.g. trailing newline at end of input
                continue

            inst = [v.strip() for v in row.split(separator)]

            # explicit check instead of assert, which vanishes under -O
            if not inst[-1]:
                raise ValueError(f'sample has no target value: {line!r}')

            self.samples.append(inst)

            # empty feature values are treated as missing and not recorded
            for i, v in enumerate(inst[:-1]):
                if v:
                    self.feats[i].add(v)

            self.targets.add(inst[-1])

    def mapping_features(self):
        """
            Build the feature-value mapping.

            Returns a dict keyed by 'f<i>' for every feature column i. Each
            entry maps an integer index to a tuple (name, flag, value): the
            sorted values of the feature get indices 0..m-1 with flag True.
            If the feature has more than two distinct values, the same
            values are listed again at indices m..2m-1 with flag False
            (presumably the negated form of each value -- confirm with the
            consumer of this mapping).
        """

        fvmap = {}

        for i in range(self.nof_feats):
            name = self.names[i]
            values = sorted(self.feats[i])
            m = len(values)

            entry = {j: (name, True, v) for j, v in enumerate(values)}

            if m > 2:
                for j, v in enumerate(values, start=m):
                    entry[j] = (name, False, v)

            fvmap[f'f{i}'] = entry

        return fvmap