-
Notifications
You must be signed in to change notification settings - Fork 1
/
dataset2.py
32 lines (22 loc) · 880 Bytes
/
dataset2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import pandas as pd
import dataset
# Remote location of the Adult files and the local folder they are fetched
# into; both roots come from the project's `dataset` module.
src = dataset.uci_root / 'adult'
dst = dataset.data_folder / 'dataset2'

# Files fetched from `src` into `dst`, in this order.
_FILES = ('adult.data', 'adult.test', 'adult.names')

# Values of column 3 that count as treated (t = True). The leading space is
# part of each value as parsed and must be kept for `isin` to match.
# NOTE(review): presumably the raw files separate fields with ", " -- confirm.
_HIGHER_ED = {' Assoc-acdm',
              ' Assoc-voc',
              ' Bachelors',
              ' Doctorate',
              ' Masters',
              ' Some-college'}

# Values of column 14 that count as a positive outcome (y = True). Both
# spellings are listed, with and without the trailing period.
# NOTE(review): presumably adult.test writes the label with a trailing '.'
# -- confirm against the raw file.
_HIGH_INCOME = {' >50K',
                ' >50K.'}

# Column positions: 3 feeds the treatment, 14 feeds the outcome, and every
# other column in 0..13 is used as a covariate.
_TREATMENT_COL = 3
_OUTCOME_COL = 14
# NOTE(review): column 4 stays in the covariates; if it is the numeric
# encoding of column 3, the treatment is fully determined by X -- confirm
# that this is intended.
_COVARIATE_COLS = [c for c in range(_OUTCOME_COL) if c != _TREATMENT_COL]


def main():
    """Fetch the Adult files, build (X, t, y) and pass them to `dataset.save`.

    X is the one-hot encoded covariate frame, t flags rows whose column 3
    value is in `_HIGHER_ED`, and y flags rows whose column 14 value is in
    `_HIGH_INCOME`.
    """
    for name in _FILES:
        dataset.get(src / name, dst / name)

    frames = [
        pd.read_csv(dst / 'adult.data', header=None),
        # The first line of adult.test is skipped.
        # NOTE(review): presumably it is a non-data banner line -- confirm.
        pd.read_csv(dst / 'adult.test', header=None, skiprows=1),
    ]
    # Each frame carries its own 0-based row labels, so a plain concat would
    # repeat labels across the two parts. ignore_index=True renumbers the
    # rows 0..n-1, giving X, t and y a unique, shared index.
    df = pd.concat(frames, ignore_index=True)

    X = pd.get_dummies(df[_COVARIATE_COLS])
    t = df[_TREATMENT_COL].isin(_HIGHER_ED)
    y = df[_OUTCOME_COL].isin(_HIGH_INCOME)
    dataset.save('dataset2', X, t, y)


if __name__ == '__main__':
    main()