scraper.py
"""Scrape Congressional Record articles from congress.gov.

Collects article metadata for a date range, saves it to CSV, and
downloads the linked HTM text files into a local data directory.
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import os
from tqdm import tqdm
import re
# import time

# Headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}


def follow_link_to_htm(url):
    """Open an article page and return the URL of its 'View TXT' HTM version, or None."""
    try:
        page_response = requests.get(url, headers=headers)
        if page_response.status_code != 200:
            print(f"Warning: Failed to fetch {url}, status code: {page_response.status_code}")
            return None
        page_soup = BeautifulSoup(page_response.content, 'html.parser')
        if not page_soup:
            print(f"Warning: Empty content for {url}")
            return None
        htm_link = page_soup.find('a', string=re.compile("View TXT in new window", re.IGNORECASE))
        if htm_link:
            htm_url = f"https://www.congress.gov{htm_link['href']}"
            return htm_url
        else:
            print(f"Warning: Could not find 'View TXT in new window' link for {url}")
            return None
    except Exception as e:
        print(f"Error following link {url}: {str(e)}")
        return None


def get_cr_df(date, section):
    """Scrape one day/section index page and return article metadata as a DataFrame."""
    print(f"Scraping {date} - {section}")
    url = f"https://www.congress.gov/congressional-record/{date.strftime('%Y/%m/%d')}/{section}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    pages = soup.find_all('a')
    data = []
    for page in pages:
        href = page.get('href')
        if href and 'article' in href:
            full_url = f"https://www.congress.gov{href}"
            file_name = f"CREC-{date.strftime('%Y-%m-%d')}-{href.split('/')[-1]}.htm"
            # Follow the link to find the HTM file
            htm_url = follow_link_to_htm(full_url)
            if not htm_url:
                print(f"Warning: Could not find HTM link for {full_url}")
            data.append({
                'header': page.text.strip(),
                'date': date,
                'section': section,
                'url': full_url,
                'file': file_name,
                'htm_url': htm_url
            })
            # time.sleep(1.5)
    return pd.DataFrame(data)


def get_cr_htm(row, output_dir):
    """Download the HTM file for one metadata row and write it into output_dir."""
    if row['htm_url']:
        try:
            response = requests.get(row['htm_url'], headers=headers)
            response.raise_for_status()  # Raise an exception for bad status codes
            content = response.text
        except requests.RequestException as e:
            print(f"Error downloading {row['htm_url']}: {e}")
            return
    else:
        print(f"No HTM link available for: {row['url']}")
        return
    if content:
        file_path = os.path.join(output_dir, row['file'])
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Downloaded: {row['file']}")
        except IOError as e:
            print(f"Error writing file {row['file']}: {e}")
    else:
        print(f"No content extracted for: {row['url']}")


def main():
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 1, 10)
    date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]
    sections = ['senate-section', 'house-section', 'extensions-of-remarks-section']

    # Create data directory
    data_dir = 'data'
    os.makedirs(data_dir, exist_ok=True)

    all_data = []
    for date in date_range:
        for section in sections:
            df = get_cr_df(date, section)
            all_data.append(df)

    cr_metadata = pd.concat(all_data, ignore_index=True)

    # Save metadata
    cr_metadata.to_csv(os.path.join(data_dir, 'cr_metadata.csv'), index=False)

    # Download HTM files
    for _, row in tqdm(cr_metadata.iterrows(), total=len(cr_metadata)):
        get_cr_htm(row, data_dir)


if __name__ == "__main__":
    main()
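
To smoke-test the scraper without running the full main() sweep over every date and section, you can fetch a single day and inspect the resulting metadata. The snippet below is a minimal sketch, assuming the script above is saved as scraper.py on the import path; the date and section are arbitrary examples, and the call makes live requests to congress.gov:

    from datetime import datetime
    from scraper import get_cr_df

    # One day, one section; returns a DataFrame with header, date,
    # section, url, file, and htm_url columns
    df = get_cr_df(datetime(2024, 1, 3), 'senate-section')
    print(df[['header', 'htm_url']].head())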