-
Notifications
You must be signed in to change notification settings - Fork 2
/
Data Quality Testing.py
210 lines (172 loc) · 9.51 KB
/
Data Quality Testing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
'''---------------README------------------
Full instructions available: https://dataonwheels.wordpress.com/?s=Power+BI%3A+Data+Quality+Checks+Using+Python+%26+SQL
The goal of this script is to compare DAX measures to corresponding SQL queries.
To accomplish this, it uses an excel file containing authentication variables and query values.
This idea came from seeing the Execute Queries REST API in Power BI and wanting to use python to perform our data quality checks efficiently.
To connect to a Power BI Data Model, we need to pass an authentication token generated through an Azure AD App.
To run a DAX query and SQL query, we need to loop through our excel file then put the queries into the API and SQL calls respectively.
Finally, this script takes those query results and puts them back into the excel file.
I have left some print() lines commented out, feel free to uncomment them to troubleshoot or walk through the code step-by-step.
You will need to swap out the excel_file variable with your excel file path. Other than that, the rest of the variables are managed inside your excel file.
Once you have your variables and queries in excel, hit run and you're good to go.
'''
#---------------------------------------#
# Import libraries needed
#---------------------------------------#
# Standard library
import json
import os

# Third-party
import adal
import openpyxl
import pandas
import pymssql
import requests
from win32com.client import Dispatch
#---------------------------------------#
# Build out authentication steps
#---------------------------------------#
#----- Authentication Variables from Excel -----#
# Grab authentication data from our excel file
from openpyxl import load_workbook
# we need the r at the beginning since "\" is a special character in python
excel_file = r"C:\Users\KristynaHughes\Downloads\PythonDataQualityChecker.xlsx"
wb = load_workbook(excel_file)
sheets = wb.sheetnames
#print(sheets) #see the sheets in our workbook
cred_ws = wb['Credentials']  # access specific sheet called Credentials
cred_mapping = {}  # maps each excel table name -> the dataframe built from it
for entry, data_boundary in cred_ws.tables.items():
    # parse the cells within the table's ref boundary
    data = cred_ws[data_boundary]
    # the inner comprehension gets the value of every cell in each table row
    content = [[cell.value for cell in row] for row in data]
    header = content[0]  # first row of the table holds the column names
    rest = content[1:]   # the remaining rows hold the data
    # this is the dataframe we will use to get our creds
    cred_df = pandas.DataFrame(rest, columns=header)
    cred_mapping[entry] = cred_df
# Use the dataframe to set up credential variables we can use later in the script.
# .at is the supported public accessor; the original used pandas' private
# _get_value(), which may be removed in newer pandas releases.
sqlserver = str(cred_df.at[0, "SQL_Server"])
sqldb = str(cred_df.at[0, "SQL_Database"])
sqluser = str(cred_df.at[0, "SQL_User"])
sqlpassword = str(cred_df.at[0, "SQL_Password"])
pbiclientid = str(cred_df.at[0, "PBI_ClientID"])
pbiusername = str(cred_df.at[0, "PBI_Username"])
pbipassword = str(cred_df.at[0, "PBI_Password"])
pbidatasetid = str(cred_df.at[0, "PBI_DatasetID"])
# check to make sure your variables are correct by uncommenting the next line
#print(sqlserver, sqldb, sqluser, sqlpassword, pbiclientid, pbiusername, pbipassword, pbidatasetid)
#----- Power BI REST API Authentication -----#
authority_url = 'https://login.windows.net/common'
resource_url = 'https://analysis.windows.net/powerbi/api'
context = adal.AuthenticationContext(authority=authority_url,
                                     validate_authority=True,
                                     api_version=None)
token = context.acquire_token_with_username_password(resource=resource_url,
                                                     username=pbiusername,
                                                     password=pbipassword,
                                                     client_id=pbiclientid)
access_token = token.get('accessToken')
if not access_token:
    # fail fast with a clear message instead of sending "Bearer None" on every
    # REST call below and getting a confusing 401/403 later
    raise RuntimeError("Power BI authentication failed: no accessToken in the ADAL response")
pbiheader = {'Authorization': f'Bearer {access_token}'}
# try out the get workspaces REST API using our access token by uncommenting the next few lines
#get_workspaces_url = 'https://api.powerbi.com/v1.0/myorg/groups'
#r = requests.get(url=get_workspaces_url, headers=pbiheader)
#r.text will give us the response text for our get request, pretty neat!
#print(r.text)
#----- SQL Server Authentication -----#
try:
    sql_con = pymssql.connect(sqlserver, sqluser, sqlpassword, sqldb)
    sql_cursor = sql_con.cursor(as_dict=True)  # as_dict=True -> rows come back as {column: value}
except Exception as e:
    # chain the original error instead of re-wrapping it in a bare Exception,
    # which discarded the exception type and obscured the traceback cause
    raise RuntimeError(f"Could not connect to SQL Server '{sqlserver}', database '{sqldb}'") from e
#---------------------------------------#
# Build out data quality check steps
#---------------------------------------#
#----- Read excel to get quality check queries into a dataframe -----#
quality_ws = wb['Quality Check']
quality_mapping = {}  # maps each excel table name -> the dataframe built from it
for entry, data_boundary in quality_ws.tables.items():  # grabs data dynamically from our table
    data = quality_ws[data_boundary]  # parse the data within the ref boundary
    # the inner comprehension gets the value of every cell in each table row
    content = [[cell.value for cell in row] for row in data]
    header = content[0]  # first row of the table holds the column names
    rest = content[1:]   # the remaining rows hold the data
    # this is the dataframe we will use to get our quality check queries
    qualitydf = pandas.DataFrame(rest, columns=header)
    # empty excel cells arrive as None/NaN; replace them with a space so
    # downstream string handling never trips over NaN
    quality_df = qualitydf.fillna(' ')
    quality_mapping[entry] = quality_df
#print(quality_df)
#----- Open excel file in edit mode -----#
xl = Dispatch("Excel.Application") # launch a COM-automated Excel instance so we can write results back
xl.Visible = True  # show the workbook while the script fills it in
edit_wb = xl.Workbooks.Open(excel_file)
edit_qc_sh = edit_wb.Worksheets("Quality Check")  # the sheet whose result cells we will populate
#----- Set variables to use in our iterators -----#
# NOTE(review): quality_df was produced with fillna(' ') above, so it contains no
# NaN and this dropna() is a no-op — blank rows are NOT actually removed; confirm
# whether blank SQL_Query cells should be filtered out explicitly instead.
qdf = quality_df.dropna() #removing any blank rows so we don't run blank sql queries lol
sqlrownumber = 1   # excel row cursor for SQL results; first data row is 2 (row 1 = headers)
pbirownumber = -1  # 0-based cursor for the DAX query loop below
#----- Run SQL queries and put results back into excel -----#
sql_queries = qdf.loc[:, "SQL_Query"]
for query in sql_queries:
    sql_cursor.execute(str(query))
    #print(sql_cursor.fetchall()[0])
    for row in sql_cursor:  # row is a {column: value} dict because the cursor uses as_dict=True
        #print(row) #this is the dictionary created from the sql query result
        sqlrownumber += 1  # excel data rows start at 2 (row 1 holds the headers)
        # each check query is expected to return a single scalar; grab the value
        # of the first column in the result row
        first_column, sql_result = next(iter(row.items()))
        # dataframe rows are 0-indexed while excel data rows start at 2, so offset
        # by 2 here (writing .at[sqlrownumber] as before silently appended new rows
        # to the dataframe instead of filling the matching existing ones)
        quality_df.at[sqlrownumber - 2, "SQL_Result"] = sql_result
        # column D is the SQL_Result column in the Quality Check sheet
        edit_qc_sh.Range(f"D{sqlrownumber}").Value = sql_result
#----- Run PBI DAX queries and put results back into excel -----#
pbi_queries = quality_df.loc[:, "PBI_DAX_Query"]
#print(pbi_queries)
execute_queries_url = f'https://api.powerbi.com/v1.0/myorg/datasets/{pbidatasetid}/executeQueries'
for pbirownumber, dax_query in enumerate(pbi_queries):
    pbiexcelrownumber = pbirownumber + 2  # excel data rows start at 2 (row 1 holds the headers)
    dax_query_json_body = {
        "queries": [
            {"query": dax_query}
        ]
    }
    pbi_response = requests.post(url=execute_queries_url, headers=pbiheader, json=dax_query_json_body)
    # surface HTTP/auth failures immediately instead of trying to parse an error
    # payload as a result (the reference example at the bottom of this file does the same)
    pbi_response.raise_for_status()
    # utf-8-sig strips the BOM that the Execute Queries endpoint prepends to its body
    response_body = json.loads(pbi_response.text.encode().decode('utf-8-sig'))
    # each DAX query is expected to return a single-cell result: take the first value
    # of the first row of the first table of the first result. Navigating the parsed
    # JSON replaces the original replace()/split(":")[4] string surgery, which broke
    # whenever a ':' appeared in the measure name or value.
    first_row = response_body["results"][0]["tables"][0]["rows"][0]
    pbi_result = next(iter(first_row.values()))
    #print(pbi_result)
    # dataframe rows are 0-indexed; the original wrote .at[pbiexcelrownumber], which
    # appended rows to the dataframe instead of filling the matching existing ones
    quality_df.at[pbirownumber, "PBI_Result"] = pbi_result
    # column C is the PBI_Result column in the Quality Check sheet
    edit_qc_sh.Range(f"C{pbiexcelrownumber}").Value = pbi_result
#----- Save our changes back to the excel file -----#
edit_wb.Save() #save the COM-opened workbook; the visible Excel instance is intentionally left running
print("All done")
### Hard-coded examples of running the queries, kept for reference only.
### The triple-quoted string below is a plain string literal, so none of it executes. ###
'''
#SQL Script Execution with Hard Coded SQL Query
sql_cursor.execute('SELECT SUM(SalesAmount) as SalesAmount FROM dbo.FactSales')
for row in sql_cursor:
print("SQL Result ", row["SalesAmount"])
#PBI Execute Queries with Hard Coded Excel Query
pbi_dataset_id = pbidatasetid
execute_queries_url = f'https://api.powerbi.com/v1.0/myorg/datasets/{pbi_dataset_id}/executeQueries'
dax_query_json_body = {
"queries":
[
{
"query": 'EVALUATE ROW("Sales_Amount", \'_Measures\'[Sales Amount])'
}
]
}
eq = requests.post(url=execute_queries_url, headers=pbiheader,json=dax_query_json_body)
eq.raise_for_status()
query_json = eq.text.encode().decode('utf-8-sig')
print("JSON Response ", json.loads(query_json))
'''