-
Notifications
You must be signed in to change notification settings - Fork 0
/
medpc_clean_subjects.py
171 lines (103 loc) · 4.27 KB
/
medpc_clean_subjects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 14 16:05:33 2022
@author: Dakota
"""
#%% Make all variations of Subject strings consistent
#% Correct variations in subject names (case, punctuation, spaces)
# input = excel sheet, output= 'cleaned' excel sheet
#TODO: ( not actually cleaning raw data file .txts)
#%% Import dependencies
import pandas as pd
import glob
import os
import seaborn as sns
#%% Define data paths for input/output
# root directory where this script is
dataPathRoot= os.getcwd()
# your path to folder containing excel files with MedPC data
dataPathInput= dataPathRoot+'/_medpc_excel_file_overview/_input/'
# output folder, here new folder to save output plots etc
dataPathOutput= dataPathRoot+'/_clean_subjects/_output/'
#output folder for log of files with 'cleaned' subjects
dataPathLogs= dataPathRoot+'/_clean_subjects/_output/_flagged_files/'
#%% TODO:
#- support independent cleaning of multiple excel files ... currently make single combined output at end but probs can just extend allFiles loop
#%% ID and import raw data .xlsx
# set all .xls files in your folder to list
allfiles = glob.glob(dataPathInput + "*.xls*")
#initialize list to store data from each file
dfRaw = pd.DataFrame()
#define columns in your .xlsx for specific variables you want (e.g. A:Z for all, but may be double letters if many variables)
colToImport= 'A:J' #assume first few columns have metadata desired
#for loop to aquire all excel files in folder
for excelfiles in allfiles:
#read all sheets by specifying sheet_name = None
#Remove any variables you don't want now before appending!
raw_excel = pd.read_excel(excelfiles, sheet_name= None, usecols=colToImport)
#append all sheets into single df
for sheet in raw_excel:
dfRaw= dfRaw.append(raw_excel[sheet], ignore_index=True)
#%% Make a column for "Notes" of files where subject is changed
#TODO: consider changing raw data file too
dfRaw['medpc_preprocessing_note']= ''
#%% Clean subject names
subjectsOG= dfRaw.Subject
#make new column for 'cleaned' subjects
dfRaw['SubjectCleaned']= dfRaw.Subject
#-- strip() to remove any extra spaces
subjectStripped= dfRaw.Subject.str.strip()
# make note of files where subjects were changed as result of strip()
ind=[]
ind= dfRaw.Subject!=subjectStripped
# test= dfRaw[ind] #viz
dfRaw.loc[ind,'medpc_preprocessing_note']= dfRaw.medpc_preprocessing_note + '_subjectStripped'
# save new subject to column
dfRaw.loc[ind,'SubjectCleaned']= subjectStripped
#-- correct variations in Case
# simply make all uppercase
subjectsUpper= subjectsOG.str.upper()
# make note of files where subjects changed as result of upper()
ind=[]
ind= dfRaw.Subject!=subjectsUpper
# test= dfRaw[ind] #viz
dfRaw.loc[ind,'medpc_preprocessing_note']= dfRaw.medpc_preprocessing_note + '_subjectUpperCased'
# save new subject to column
dfRaw.loc[ind,'SubjectCleaned']= subjectsUpper
#-- get report of files with cleaned subject
ind= []
# ind= ~dfRaw.medpc_preprocessing_note.isnull()
ind= ~(dfRaw.medpc_preprocessing_note == '')
#%% save flagged files as csv
flagged= dfRaw.loc[ind,:]
import datetime
fileName= '_LOG_Files_subjectsCleaned_'+str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
os.chdir(dataPathLogs)
# flagged.to_csv(strLog)
#save as excel sheet
fileName= fileName+'.xlsx'
flagged.to_excel(fileName)
os.chdir(dataPathRoot)
#%% Overwrite original subjects with 'cleaned' and save new .xlsx
# overwrite OG column and drop new column
dfRaw.loc[:,'Subject']= dfRaw.SubjectCleaned
dfRaw= dfRaw.drop('SubjectCleaned', axis=1)
#keep original excel sheet name, add '_cleaned'
# TODO: this just grabs first excel file name in allFiles
fileName= os.path.basename(allfiles[0])
#remove prior .xslx from filename (TODO: assumes exactly .xlsx)
fileName=fileName[0:-5]
# thisFileName= os.path.basename(fileName)
fileName= fileName+'_cleanedSubjects'
os.chdir(dataPathOutput)
#save as excel sheet
fileName= fileName+'.xlsx'
dfRaw.to_excel(fileName, index=False)
os.chdir(dataPathRoot)
# import datetime
# strLog= '_LOG_Files_subjectsCleaned'+str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) +'.txt'
# with open(strLog, 'w') as f:
# for line in test:
# f.write(f"{line}\n")
# test= subjectsOG.unique()
# test2= dfRaw.SubjectCleaned.unique()