-
Notifications
You must be signed in to change notification settings - Fork 4.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2173563
commit 7b190b1
Showing
1 changed file
with
372 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,372 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df = pd.read_fwf('data/headlines.txt')\n", | ||
"df.columns = ['headline']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 29, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>headline</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <td>0</td>\n", | ||
" <td>Could Zika Reach New York City?</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>1</td>\n", | ||
" <td>First Case of Zika in Miami Beach</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>2</td>\n", | ||
" <td>Mystery Virus Spreads in Recife, Brazil</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>3</td>\n", | ||
" <td>Dallas man comes down with case of Zika</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>4</td>\n", | ||
" <td>Trinidad confirms first Zika case</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" headline\n", | ||
"0 Could Zika Reach New York City?\n", | ||
"1 First Case of Zika in Miami Beach\n", | ||
"2 Mystery Virus Spreads in Recife, Brazil\n", | ||
"3 Dallas man comes down with case of Zika\n", | ||
"4 Trinidad confirms first Zika case" | ||
] | ||
}, | ||
"execution_count": 29, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 46, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import geonamescache\n", | ||
"\n", | ||
"gc = geonamescache.GeonamesCache()\n", | ||
"countries = gc.get_countries()\n", | ||
"cities = gc.get_cities()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 58, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"country_names = []\n", | ||
"country_ids = list(countries.keys())\n", | ||
"for country_id in country_ids:\n", | ||
" country_names.append(countries[country_id]['name'])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 60, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"252" | ||
] | ||
}, | ||
"execution_count": 60, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(country_names)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 73, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"['Andorra',\n", | ||
" 'United Arab Emirates',\n", | ||
" 'Afghanistan',\n", | ||
" 'Antigua and Barbuda',\n", | ||
" 'Anguilla',\n", | ||
" 'Albania',\n", | ||
" 'Armenia',\n", | ||
" 'Angola',\n", | ||
" 'Antarctica',\n", | ||
" 'Argentina',\n", | ||
" 'American Samoa',\n", | ||
" 'Austria',\n", | ||
" 'Australia',\n", | ||
" 'Aruba',\n", | ||
" 'Aland Islands',\n", | ||
" 'Azerbaijan',\n", | ||
" 'Bosnia and Herzegovina',\n", | ||
" 'Barbados',\n", | ||
" 'Bangladesh',\n", | ||
" 'Belgium']" | ||
] | ||
}, | ||
"execution_count": 73, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"country_names[:20]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 68, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"city_names = []\n", | ||
"city_ids = list(cities.keys())\n", | ||
"for city_id in city_ids:\n", | ||
" city_names.append(cities[city_id]['name'])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 70, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"24336" | ||
] | ||
}, | ||
"execution_count": 70, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(city_names)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 94, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"['Andorra la Vella',\n", | ||
" 'Umm Al Quwain City',\n", | ||
" 'Ras Al Khaimah City',\n", | ||
" 'Zayed City',\n", | ||
" 'Khawr Fakkān',\n", | ||
" 'Dubai',\n", | ||
" 'Dibba Al-Fujairah',\n", | ||
" 'Dibba Al-Hisn',\n", | ||
" 'Sharjah',\n", | ||
" 'Ar Ruways',\n", | ||
" 'Al Fujairah City',\n", | ||
" 'Al Ain City',\n", | ||
" 'Ajman City',\n", | ||
" 'Adh Dhayd',\n", | ||
" 'Abu Dhabi',\n", | ||
" 'Khalifah A City',\n", | ||
" 'Bani Yas City',\n", | ||
" 'Musaffah',\n", | ||
" 'Al Shamkhah City',\n", | ||
" 'Reef Al Fujairah City']" | ||
] | ||
}, | ||
"execution_count": 94, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"city_names[:20]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 89, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from unidecode import unidecode\n", | ||
"\n", | ||
"city_names_unidecoded = [unidecode(city) for city in city_names]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 93, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"['Andorra la Vella',\n", | ||
" 'Umm Al Quwain City',\n", | ||
" 'Ras Al Khaimah City',\n", | ||
" 'Zayed City',\n", | ||
" 'Khawr Fakkan',\n", | ||
" 'Dubai',\n", | ||
" 'Dibba Al-Fujairah',\n", | ||
" 'Dibba Al-Hisn',\n", | ||
" 'Sharjah',\n", | ||
" 'Ar Ruways',\n", | ||
" 'Al Fujairah City',\n", | ||
" 'Al Ain City',\n", | ||
" 'Ajman City',\n", | ||
" 'Adh Dhayd',\n", | ||
" 'Abu Dhabi',\n", | ||
" 'Khalifah A City',\n", | ||
" 'Bani Yas City',\n", | ||
" 'Musaffah',\n", | ||
" 'Al Shamkhah City',\n", | ||
" 'Reef Al Fujairah City']" | ||
] | ||
}, | ||
"execution_count": 93, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"city_names_unidecoded[:20]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 104, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pattern_country = '|'.join(country_names)\n", | ||
"pattern_city = '|'.join(city_names)\n", | ||
"\n", | ||
"def pattern_searcher(search_str:str, search_list:str):\n", | ||
" search_obj = re.search(search_list, search_str)\n", | ||
" if search_obj :\n", | ||
" return_str = search_str[search_obj.start(): search_obj.end()]\n", | ||
" else:\n", | ||
" return_str = 'NA'\n", | ||
" return return_str\n", | ||
"\n", | ||
"df['country'] = df['headline'].apply(lambda x: pattern_searcher(search_str=x, search_list=pattern_country))\n", | ||
"df['city'] = df['headline'].apply(lambda x: pattern_searcher(search_str=x, search_list=pattern_city))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 107, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df = df.replace('NA', np.nan)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 108, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"headline 0\n", | ||
"country 633\n", | ||
"city 40\n", | ||
"dtype: int64" | ||
] | ||
}, | ||
"execution_count": 108, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df.isnull().sum()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |