scraper.py
"""Scrape Congressional Record articles from congress.gov.

Collects article metadata for a date range, saves it to CSV, and
downloads the linked HTM text files into a local data directory.
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import os
from tqdm import tqdm
import re
# import time

# Headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}


def follow_link_to_htm(url):
    """Open an article page and return the URL of its 'View TXT' HTM version, or None."""
    try:
        page_response = requests.get(url, headers=headers)
        if page_response.status_code != 200:
            print(f"Warning: Failed to fetch {url}, status code: {page_response.status_code}")
            return None
        page_soup = BeautifulSoup(page_response.content, 'html.parser')
        if not page_soup:
            print(f"Warning: Empty content for {url}")
            return None
        htm_link = page_soup.find('a', string=re.compile("View TXT in new window", re.IGNORECASE))
        if htm_link:
            htm_url = f"https://www.congress.gov{htm_link['href']}"
            return htm_url
        else:
            print(f"Warning: Could not find 'View TXT in new window' link for {url}")
            return None
    except Exception as e:
        print(f"Error following link {url}: {str(e)}")
        return None


def get_cr_df(date, section):
    """Scrape one day/section index page and return article metadata as a DataFrame."""
    print(f"Scraping {date} - {section}")
    url = f"https://www.congress.gov/congressional-record/{date.strftime('%Y/%m/%d')}/{section}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    pages = soup.find_all('a')
    data = []
    for page in pages:
        href = page.get('href')
        if href and 'article' in href:
            full_url = f"https://www.congress.gov{href}"
            file_name = f"CREC-{date.strftime('%Y-%m-%d')}-{href.split('/')[-1]}.htm"
            # Follow the link to find the HTM file
            htm_url = follow_link_to_htm(full_url)
            if not htm_url:
                print(f"Warning: Could not find HTM link for {full_url}")
            data.append({
                'header': page.text.strip(),
                'date': date,
                'section': section,
                'url': full_url,
                'file': file_name,
                'htm_url': htm_url
            })
            # time.sleep(1.5)
    return pd.DataFrame(data)


def get_cr_htm(row, output_dir):
    """Download the HTM file for one metadata row and write it into output_dir."""
    if row['htm_url']:
        try:
            response = requests.get(row['htm_url'], headers=headers)
            response.raise_for_status()  # Raise an exception for bad status codes
            content = response.text
        except requests.RequestException as e:
            print(f"Error downloading {row['htm_url']}: {e}")
            return
    else:
        print(f"No HTM link available for: {row['url']}")
        return
    if content:
        file_path = os.path.join(output_dir, row['file'])
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Downloaded: {row['file']}")
        except IOError as e:
            print(f"Error writing file {row['file']}: {e}")
    else:
        print(f"No content extracted for: {row['url']}")


def main():
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 1, 10)
    date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
    sections = ['senate-section', 'house-section', 'extensions-of-remarks-section']

    # Create data directory
    data_dir = 'data'
    os.makedirs(data_dir, exist_ok=True)

    all_data = []
    for date in date_range:
        for section in sections:
            df = get_cr_df(date, section)
            all_data.append(df)

    cr_metadata = pd.concat(all_data, ignore_index=True)

    # Save metadata
    cr_metadata.to_csv(os.path.join(data_dir, 'cr_metadata.csv'), index=False)

    # Download HTM files
    for _, row in tqdm(cr_metadata.iterrows(), total=len(cr_metadata)):
        get_cr_htm(row, data_dir)


if __name__ == "__main__":
    main()
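
To smoke-test the scraper without running the full main() sweep over every date and section, you can fetch a single day and inspect the resulting metadata. The snippet below is a minimal sketch, assuming the script above is saved as scraper.py on the import path; the date and section are arbitrary examples, and the call makes live requests to congress.gov:

    from datetime import datetime
    from scraper import get_cr_df

    # One day, one section; returns a DataFrame with header, date,
    # section, url, file, and htm_url columns
    df = get_cr_df(datetime(2024, 1, 3), 'senate-section')
    print(df[['header', 'htm_url']].head())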