# Caroline DE POURTALES authored
# data.py 2.75 KiB
#!/usr/bin/env python
#-*- coding:utf-8 -*-
##
## data.py
##
## Created on: Sep 20, 2017
## Author: Alexey Ignatiev, Nina Narodytska
## E-mail: aignatiev@ciencias.ulisboa.pt, narodytska@vmware.com
##
#
#==============================================================================
from __future__ import print_function
import collections
import itertools
import pickle
import six
import gzip
from six.moves import range
import numpy as np
import pandas as pd
#
#==============================================================================
class Data(object):
    """
    Class for representing data (transactions).

    The input is the text of a separated-values table. The first line is a
    header naming every column; each following non-blank line is one sample.
    The last column is treated as the target, all preceding columns as
    features.

    Attributes:
        names: column names from the header, in order (target last), with
            double quotes removed and surrounding whitespace stripped.
        nm2id: mapping from column name to its index in ``names``.
        nonbin2bin: mapping from the part of a column name preceding its
            last ':' (the whole name if it has no ':') to the list of
            column names sharing that prefix.
        feats: one set per feature column holding the non-empty values seen
            in that column.
        targets: set of values seen in the last field of the samples.
        samples: parsed rows (lists of stripped strings); identical input
            lines are kept only once, in order of first occurrence.
        nof_feats: number of feature columns.
    """

    def __init__(self, data, separator=','):
        """
        Constructor and parser.

        Args:
            data: the whole data set as a single string, with rows
                separated by newline characters.
            separator: string separating the fields of a row.

        Raises:
            ValueError: if a non-blank sample row has an empty last field.
        """
        self.names = None
        self.nm2id = None
        self.nonbin2bin = None
        self.feats = None
        self.targets = None
        self.samples = None
        self.nof_feats = None

        self.parse(data, separator)

    def parse(self, data, separator):
        """
        Parse the textual data set and (re)populate all attributes.

        Args:
            data: the whole data set as a single string.
            separator: string separating the fields of a row.

        Raises:
            ValueError: if a non-blank sample row has an empty last field.
        """
        lines = data.split('\n')

        # reading preamble
        self.names = [name.replace('"', '').strip()
                      for name in lines[0].strip().split(separator)]
        self.feats = [set() for _ in self.names[:-1]]
        self.targets = set()
        self.nof_feats = len(self.feats)

        # filling name to id mapping
        self.nm2id = {name: i for i, name in enumerate(self.names)}

        # grouping column names by the prefix preceding their last ':'
        self.nonbin2bin = {}
        for name in self.nm2id:
            prefix = name.rsplit(':', 1)[0]
            self.nonbin2bin.setdefault(prefix, []).append(name)

        # reading samples; dict.fromkeys drops repeated lines while keeping
        # the order in which they first appear
        self.samples = []
        for line in dict.fromkeys(lines[1:]):
            stripped = line.strip()
            if not stripped:
                # blank lines (e.g. the one produced by a trailing newline)
                # carry no sample
                continue

            inst = [v.strip() for v in stripped.split(separator)]

            # explicit check instead of assert, which is removed under -O
            if not inst[-1]:
                raise ValueError(
                    'sample has an empty target value: {0!r}'.format(line))

            self.samples.append(inst)

            for i, v in enumerate(inst[:-1]):
                if v:
                    self.feats[i].add(v)

            self.targets.add(inst[-1])

    def mapping_features(self):
        """
        Build the feature-value mapping.

        Returns:
            A dict with one entry per feature, keyed 'f<i>' for feature
            index i. Each entry maps an integer j to a tuple
            ``(feature name, True, value)`` for the j-th value of the
            feature in sorted order. For features with more than two
            distinct values, the entry additionally maps ``j + m`` (m being
            the number of distinct values) to
            ``(feature name, False, value)``.
        """
        fvmap = {}

        for i in range(self.nof_feats):
            name = self.names[i]
            values = sorted(self.feats[i])

            entry = {j: (name, True, v) for j, v in enumerate(values)}

            if len(values) > 2:
                # second half of the index range holds the False-flagged
                # counterpart of every value
                m = len(values)
                for j, v in enumerate(values):
                    entry[j + m] = (name, False, v)

            fvmap[f'f{i}'] = entry

        return fvmap