# Implement Hatchet-powered web scraping workflows #1

Open · wants to merge 5 commits into base `main`
Changes from 1 commit
(backend): Initial backend implementation for Hatchet Scraper Example
boatengyeboah committed Aug 30, 2024

commit 8b64be53fa89a349d45a354d4b624db9d710db94
80 changes: 80 additions & 0 deletions backend/README.md
# Hatchet Scraper Example Backend

This is the backend component of the Hatchet Scraper Example project, demonstrating how to use Hatchet with FastAPI for web scraping tasks.

## Prerequisites

Before running this project, make sure you have the following:

1. Python 3.12 or higher installed on your machine.
2. The Poetry package manager installed. You can install it with `pip install poetry`, or by following the instructions in the [Poetry Docs](https://python-poetry.org/docs/#installation).

## Setup

1. Create a `.env` file in the `./backend` directory and set the required environment variable:

```shell
HATCHET_CLIENT_TOKEN="<your-hatchet-api-key>"
```

2. Install the project dependencies:

```shell
poetry install
```

## Running the API

To start the backend server, run the following command:

```shell
poetry run start-api
```

## Running the Hatchet Worker

In a separate terminal, start the Hatchet worker by running:
```shell
poetry run start-worker
```

## Project Structure

- `src/api/main.py`: FastAPI application setup and endpoints
- `src/workflows/`: Contains Hatchet workflow definitions
- `scraper_workflow.py`: Main scraper workflow
- `main.py`: Workflow registration and worker setup

## Workflows

The project contains three main workflows:

1. `ScraperWorkflow`: Orchestrates the scraping process for both TechCrunch and Google News
2. `TechCrunchAIScraperWorkflow`: Scrapes AI-related articles from TechCrunch
3. `GoogleNewsScraperWorkflow`: Scrapes top stories from Google News

These workflows are defined in `src/workflows/scraper_workflow.py` and registered in `src/workflows/main.py`.
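
A minimal sketch of what one of these definitions might look like with the Hatchet Python SDK is shown below. The event name, URL, and CSS selector are illustrative assumptions, not the actual code in `scraper_workflow.py`:

```python
import requests
from bs4 import BeautifulSoup
from hatchet_sdk import Context

from .hatchet import hatchet

@hatchet.workflow(on_events=["scraper:start"])  # assumed event name
class GoogleNewsScraperWorkflow:
    @hatchet.step()
    def scrape_top_stories(self, context: Context):
        # Fetch the page and collect headline text; the selector is a placeholder
        resp = requests.get("https://news.google.com", timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        titles = [a.get_text(strip=True) for a in soup.select("article a")][:10]
        return {"status": "success", "articles": [{"title": t} for t in titles]}
```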

## API Endpoints

- `POST /scrape`: Initiates the scraping workflow and returns the workflow run ID as `messageId`
- `GET /message/{messageId}`: Streams progress events for a workflow run as server-sent events
- `GET /`: Basic route to confirm the API is working

For more details on the API implementation, refer to `src/api/main.py`.
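
Once both the API server and the worker are running, the endpoints can be exercised with `curl` (assuming the default `localhost:8000` bind from `src/api/main.py`):

```shell
# Kick off a run; the response contains the workflow run ID as messageId
curl -X POST http://localhost:8000/scrape

# Follow that run's progress as server-sent events (-N disables buffering)
curl -N http://localhost:8000/message/<workflow-run-id>
```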

## Environment Variables

Make sure to set up the following environment variable in your `.env` file:

- `HATCHET_CLIENT_TOKEN`: Your Hatchet API key

## Dependencies

Key dependencies for this project include:

- FastAPI
- Uvicorn
- Hatchet SDK
- Requests
- BeautifulSoup4

For the complete list of dependencies, see `backend/pyproject.toml`.
Empty file added backend/__init__.py
822 changes: 822 additions & 0 deletions backend/poetry.lock

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions backend/pyproject.toml
[tool.poetry]
name = "src"
version = "0.1.0"
description = "This project is a quick-start example for building a web scraping API using FastAPI and Hatchet for task management."
authors = []
readme = "README.md"

[tool.poetry.scripts]
start-api = "src.api.main:start"
start-worker = "src.workflows.main:start"

[tool.poetry.dependencies]
python = "^3.12"
fastapi = "^0.112.1"
uvicorn = "^0.30.6"
hatchet-sdk = "^0.35.1"
requests = "^2.32.3"
beautifulsoup4 = "^4.12.3"
pydantic = "^2.6.1"
pydantic-settings = "^2.1.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Empty file added backend/src/__init__.py
Empty file added backend/src/api/__init__.py
98 changes: 98 additions & 0 deletions backend/src/api/main.py
import json
import logging
import os
from typing import AsyncGenerator

import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from hatchet_sdk import Hatchet
from pydantic import BaseModel

from ..config import settings

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set the environment variable for Hatchet
os.environ["HATCHET_CLIENT_TOKEN"] = settings.hatchet_client_token

# Initialize Hatchet client
hatchet = Hatchet()

# Initialize FastAPI app
app = FastAPI()

# Define CORS origins
origins = [
"http://localhost:3000",
"localhost:3000"
]

# Apply CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"]
)

# Define Pydantic model for response
class ScrapeResponse(BaseModel):
messageId: str

@app.get("/")
def read_root():
return {"message": "Welcome to the Hatchet Scraper API!"}

@app.post("/scrape", response_model=ScrapeResponse)
async def scrape():
workflowRun = await hatchet.client.admin.aio.run_workflow("ScraperWorkflow", {})
logger.info(f"Started scraping workflow with ID: {workflowRun.workflow_run_id}")
return ScrapeResponse(messageId=workflowRun.workflow_run_id)

# Generator function to stream events from a Hatchet workflow
async def event_stream_generator(workflowRunId: str) -> AsyncGenerator[str, None]:
logger.info(f"Starting event stream for workflow run ID: {workflowRunId}")
workflowRun = hatchet.client.admin.get_workflow_run(workflowRunId)

try:
async for event in workflowRun.stream():
logger.info(f"Received event: {event.type}")
data = json.dumps({
"type": event.type,
"payload": event.payload,
"messageId": workflowRunId
})
yield f"data: {data}\n\n"

result = await workflowRun.result()
logger.info(f"Workflow completed. Result: {result}")
data = json.dumps({
"type": "result",
"payload": result,
"messageId": workflowRunId
})
yield f"data: {data}\n\n"
except Exception as e:
logger.error(f"Error in event stream: {str(e)}")
error_data = json.dumps({
"type": "error",
"payload": {"message": str(e)},
"messageId": workflowRunId
})
yield f"data: {error_data}\n\n"

@app.get("/message/{messageId}")
async def stream(messageId: str):
return StreamingResponse(event_stream_generator(messageId), media_type='text/event-stream')

def start():
uvicorn.run("src.api.main:app", host="0.0.0.0", port=8000, reload=True)

if __name__ == "__main__":
start()
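
For reference, a hedged sketch of consuming this SSE endpoint from Python with `requests` (not part of this commit; a browser frontend would do the equivalent with `EventSource`):

```python
import json
import requests

# Start a run, then follow its event stream; assumes the API on localhost:8000
run_id = requests.post("http://localhost:8000/scrape").json()["messageId"]
with requests.get(f"http://localhost:8000/message/{run_id}", stream=True) as resp:
    for line in resp.iter_lines():
        if line.startswith(b"data: "):
            event = json.loads(line[len(b"data: "):])
            print(event["type"], event.get("payload"))
```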
9 changes: 9 additions & 0 deletions backend/src/config.py
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
hatchet_client_token: str
debug: bool = True

model_config = SettingsConfigDict(env_file='.env', env_file_encoding='utf-8')

settings = Settings()
9 changes: 9 additions & 0 deletions backend/src/workflows/hatchet.py

import os

from hatchet_sdk import Hatchet

from ..config import settings

# Make the token from .env visible to the Hatchet SDK before the client is created
os.environ["HATCHET_CLIENT_TOKEN"] = settings.hatchet_client_token

hatchet = Hatchet()
10 changes: 10 additions & 0 deletions backend/src/workflows/main.py
from .hatchet import hatchet
from .scraper_workflow import ScraperWorkflow, TechCrunchAIScraperWorkflow, GoogleNewsScraperWorkflow


def start():
worker = hatchet.worker("scraper-worker")
worker.register_workflow(ScraperWorkflow())
worker.register_workflow(TechCrunchAIScraperWorkflow())
worker.register_workflow(GoogleNewsScraperWorkflow())
worker.start()
14 changes: 14 additions & 0 deletions backend/src/workflows/models.py
from pydantic import BaseModel
from typing import List, Optional

class Article(BaseModel):
title: Optional[str] = ""
author: Optional[str] = ""
link: Optional[str] = ""
excerpt: Optional[str] = ""
published_time: Optional[str] = ""

class ScrapingResult(BaseModel):
status: str
articles: List[Article] = []
message: Optional[str] = None
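
Hypothetical usage of these models inside a scraper step (illustrative only, not part of this commit):

```python
from src.workflows.models import Article, ScrapingResult

# Package scraped items into the shared result shape; the values are made up
result = ScrapingResult(
    status="success",
    articles=[Article(title="Example headline", link="https://example.com/post")],
)
payload = result.model_dump()  # plain dict, JSON-safe to return from a Hatchet step
```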