-
Notifications
You must be signed in to change notification settings - Fork 2
/
redact_numerics.py
26 lines (21 loc) · 1.29 KB
/
redact_numerics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import re
import pandas as pd
# Find sensitive numbers (phone numbers, SSNs, latitude/longitude coordinates, some ZIP codes)
# in the data and replace each one with the placeholder 'xxx', using the functions below.
def sensitive_numerics(number):
    """Replace sensitive-looking numbers in a piece of text with ``'xxx'``.

    Two substitution passes run in order:

    1. General pass: an optional leading ``+`` or ``(``, one digit, then six
       or more characters drawn from digits, space, ``.``, dashes and
       parentheses. This covers phone numbers, SSNs, long digit runs and
       coordinates such as ``42.293564``. Because space, ``.`` and ``)`` are
       part of the character class, such characters directly after the
       number are consumed by the match as well.
    2. Hyphenated shapes ``d-ddd-d-dddd`` and ``ddd-d-dddd``, for example
       ``'1-234-5-6789'`` and ``'123-4-5648'``.

    Args:
        number: The value to scan. Strings are redacted; any other value
            (``None``, ``NaN``, numbers, ...) is returned unchanged so a
            column with missing entries does not raise ``TypeError``.
            Cast numeric data to ``str`` first if it must be redacted too.

    Returns:
        The redacted string, or the input itself when it is not a string.
    """
    # re.sub raises TypeError on non-string input; pass such values through.
    if not isinstance(number, str):
        return number

    # The dash set includes the Unicode dashes U+2010..U+2015 and the minus
    # sign U+2212 in addition to ASCII '-', because numbers are often typed
    # or pasted with an en dash ('123–145–1469') and would otherwise be left
    # fully or partly unredacted.
    number = re.sub(
        r'[\+\(]?\d[\d .\-\(\)\u2010-\u2015\u2212]{6,}', 'xxx', number
    )

    # [0-9]{n} is n digits in a row, '-' is a literal hyphen, '|' separates
    # the two accepted shapes. Text of these shapes is digits and hyphens
    # only, so the general pass above will normally already have replaced
    # it; the explicit pass is kept as a safety net.
    number = re.sub(
        r'[0-9]-[0-9]{3}-[0-9]-[0-9]{4}|[0-9]{3}-[0-9]-[0-9]{4}',
        'xxx',
        number,
    )
    return number
def numerics(data):
    """Run ``sensitive_numerics`` over every element of ``data``.

    Args:
        data: An object exposing ``apply``; the sample usage in this file
            passes a pandas Series of strings.

    Returns:
        A pandas Series built from the per-element redaction results.
    """
    redacted = data.apply(sensitive_numerics)
    return pd.Series(redacted)
############################################# Test
##################################################
# Texts = ['1231451469', '42.2', '123 145 1469', '123.145.1469', '(123) 145.1469', '(123) 145 1469',
# '(123) 145–1469', '123–145–1469', '+1(123) 145–1469 ', '1234567890999111', '123HELLO56',
# '-123', '04/04/1998', 'it’s015–96–0342 you know my number call me', '+123–145–1469',
# '48236–123', 'I live close to (42.293564, -83.638916)', '123-4-5648', '1-234-5-6789']
# data = pd.Series(Texts)
# print(numerics(data))