Skip to content

Commit

Permalink
Add notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
susanli2016 authored Jun 14, 2020
1 parent 2173563 commit 7b190b1
Showing 1 changed file with 372 additions and 0 deletions.
372 changes: 372 additions & 0 deletions Outbreaks_Headlines_1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,372 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_fwf('data/headlines.txt')\n",
"df.columns = ['headline']"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>headline</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>Could Zika Reach New York City?</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>First Case of Zika in Miami Beach</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>Mystery Virus Spreads in Recife, Brazil</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>Dallas man comes down with case of Zika</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>Trinidad confirms first Zika case</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" headline\n",
"0 Could Zika Reach New York City?\n",
"1 First Case of Zika in Miami Beach\n",
"2 Mystery Virus Spreads in Recife, Brazil\n",
"3 Dallas man comes down with case of Zika\n",
"4 Trinidad confirms first Zika case"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"import geonamescache\n",
"\n",
"gc = geonamescache.GeonamesCache()\n",
"countries = gc.get_countries()\n",
"cities = gc.get_cities()"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"country_names = []\n",
"country_ids = list(countries.keys())\n",
"for country_id in country_ids:\n",
" country_names.append(countries[country_id]['name'])"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"252"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(country_names)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Andorra',\n",
" 'United Arab Emirates',\n",
" 'Afghanistan',\n",
" 'Antigua and Barbuda',\n",
" 'Anguilla',\n",
" 'Albania',\n",
" 'Armenia',\n",
" 'Angola',\n",
" 'Antarctica',\n",
" 'Argentina',\n",
" 'American Samoa',\n",
" 'Austria',\n",
" 'Australia',\n",
" 'Aruba',\n",
" 'Aland Islands',\n",
" 'Azerbaijan',\n",
" 'Bosnia and Herzegovina',\n",
" 'Barbados',\n",
" 'Bangladesh',\n",
" 'Belgium']"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"country_names[:20]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"city_names = []\n",
"city_ids = list(cities.keys())\n",
"for city_id in city_ids:\n",
" city_names.append(cities[city_id]['name'])"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"24336"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(city_names)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Andorra la Vella',\n",
" 'Umm Al Quwain City',\n",
" 'Ras Al Khaimah City',\n",
" 'Zayed City',\n",
" 'Khawr Fakkān',\n",
" 'Dubai',\n",
" 'Dibba Al-Fujairah',\n",
" 'Dibba Al-Hisn',\n",
" 'Sharjah',\n",
" 'Ar Ruways',\n",
" 'Al Fujairah City',\n",
" 'Al Ain City',\n",
" 'Ajman City',\n",
" 'Adh Dhayd',\n",
" 'Abu Dhabi',\n",
" 'Khalifah A City',\n",
" 'Bani Yas City',\n",
" 'Musaffah',\n",
" 'Al Shamkhah City',\n",
" 'Reef Al Fujairah City']"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"city_names[:20]"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"from unidecode import unidecode\n",
"\n",
"city_names_unidecoded = [unidecode(city) for city in city_names]"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Andorra la Vella',\n",
" 'Umm Al Quwain City',\n",
" 'Ras Al Khaimah City',\n",
" 'Zayed City',\n",
" 'Khawr Fakkan',\n",
" 'Dubai',\n",
" 'Dibba Al-Fujairah',\n",
" 'Dibba Al-Hisn',\n",
" 'Sharjah',\n",
" 'Ar Ruways',\n",
" 'Al Fujairah City',\n",
" 'Al Ain City',\n",
" 'Ajman City',\n",
" 'Adh Dhayd',\n",
" 'Abu Dhabi',\n",
" 'Khalifah A City',\n",
" 'Bani Yas City',\n",
" 'Musaffah',\n",
" 'Al Shamkhah City',\n",
" 'Reef Al Fujairah City']"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"city_names_unidecoded[:20]"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"pattern_country = '|'.join(country_names)\n",
"pattern_city = '|'.join(city_names)\n",
"\n",
"def pattern_searcher(search_str:str, search_list:str):\n",
" search_obj = re.search(search_list, search_str)\n",
" if search_obj :\n",
" return_str = search_str[search_obj.start(): search_obj.end()]\n",
" else:\n",
" return_str = 'NA'\n",
" return return_str\n",
"\n",
"df['country'] = df['headline'].apply(lambda x: pattern_searcher(search_str=x, search_list=pattern_country))\n",
"df['city'] = df['headline'].apply(lambda x: pattern_searcher(search_str=x, search_list=pattern_city))"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"df = df.replace('NA', np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"headline 0\n",
"country 633\n",
"city 40\n",
"dtype: int64"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 7b190b1

Please sign in to comment.