Commit 6cfa65e (1 parent: 5f571d9), showing 22 changed files with 1,766 additions and 14 deletions.
@@ -0,0 +1,130 @@
"""The facade interfaces to integrate crawler, filter, and handler""" | ||
import asyncio | ||
import logging | ||
import queue | ||
import typing | ||
from typing import Set | ||
from urllib.parse import urlparse | ||
|
||
import aiohttp | ||
from aiohttp import ClientResponse | ||
|
||
from secretscraper.coroutinue import AsyncPoolCollector, AsyncTask | ||
from secretscraper.entity import URL, Secret, URLNode | ||
from secretscraper.filter import URLFilter | ||
from secretscraper.handler import Handler | ||
from secretscraper.urlparser import URLParser | ||
|
||
from .exception import CrawlerException | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Crawler:
    """Crawler interface"""

    def __init__(self,
                 start_urls: list[str],
                 client: aiohttp.ClientSession,
                 url_filter: URLFilter,
                 parser: URLParser,
                 handler: Handler,
                 max_page_num: int = 0,
                 max_depth: int = 3,
                 num_workers: int = 100,
                 ):
        """
        :param start_urls: urls to start crawling from
        :param client: aiohttp client session
        :param url_filter: determines whether a url should be crawled
        :param parser: extracts child url nodes from html
        :param handler: how to deal with the crawl result
        :param max_page_num: max number of urls to crawl, 0 for no limit
        :param max_depth: max url depth, should be greater than 0
        :param num_workers: number of workers in the async pool
        """
        self.start_urls = start_urls
        self.client = client
        self.filter = url_filter
        self.parser = parser
        self.handler = handler
        self.max_page_num = max_page_num
        self.max_depth = max_depth
        self.num_workers = num_workers

        self.visited_urls: Set[URLNode] = set()  # urls that have already been enqueued
        self.found_urls: Set[URLNode] = set()  # newly found urls
        self.working_queue: queue.Queue[URLNode] = queue.Queue()  # BFS queue
        self.url_dict: dict[URLNode, set[URLNode]] = dict()  # maps a url to all of its child urls
        self.total_page: int = 0  # total number of pages found
        self.url_secrets: dict[URLNode, set[Secret]] = dict()  # maps a url to the secrets found in it
        self._event_loop = asyncio.new_event_loop()
        self.pool: AsyncPoolCollector = AsyncPoolCollector.create_pool(
            num_workers=num_workers,
            queue_capacity=0,
            event_loop=self._event_loop
        )

    def start(self):
        """Start the event loop"""
        self._event_loop.run_until_complete(self.run())

    async def run(self):
        """Start the crawler"""
        try:
            # initialize the queue with start_urls
            for url in self.start_urls:
                url_obj = urlparse(url)
                url_node = URLNode(url=url, url_object=url_obj, depth=0, parent=None)
                self.visited_urls.add(url_node)
                self.working_queue.put(url_node)

            while True:
                if self.working_queue.empty() and self.pool.is_finish:
                    break

                try:
                    url_node = self.working_queue.get_nowait()
                except queue.Empty:
                    await asyncio.sleep(0.1)
                    continue
                if url_node.depth <= self.max_depth:
                    task = AsyncTask(self.process_one, url_node)
                    await self.pool.submit(task)
        except Exception as e:
            await self.clean()
            raise CrawlerException("Unexpected Exception") from e

    async def process_one(self, url_node: URLNode):
        """Fetch a url, run the handler on the response, and extract child urls"""
        response = await self.fetch(url_node.url)
        if response.status == 200 and response.content_type == 'text/html':
            # call handler and urlparser
            # extract secrets TODO: nonblocking extract
            response_text: str = await response.text(encoding="utf8", errors="ignore")
            secrets = self.handler.handle(response_text)
            if secrets is not None:
                self.url_secrets[url_node] = set(secrets)
            # extract links TODO: nonblocking extract
            # URLParser.extract_urls needs the base node as well as the html text
            url_children: set[URLNode] = self.parser.extract_urls(url_node, response_text)
            for child in url_children:
                if child is not None and child not in self.visited_urls:
                    self.visited_urls.add(child)  # mark as enqueued so it is not crawled twice
                    self.found_urls.add(child)
                    self.working_queue.put(child)
        else:
            # do not extend the crawl on non-HTML or non-200 responses
            return
        logger.debug(f"process_one {url_node.url} got response: {response.status}")

    async def fetch(self, url: str) -> ClientResponse:
        """Wrapper for sending http request"""
        response = await self.client.get(url, allow_redirects=False)
        return response

    async def clean(self):
        """Close pool, cancel tasks, close http client session"""
        await self.client.close()
        await self.pool.close()
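
A minimal usage sketch of how the pieces above might be wired together. It is an assumption-laden illustration, not part of the commit: my_filter and my_handler stand in for whatever URLFilter and Handler implementations the project provides, and the Crawler import path is guessed from the package name.

# Hypothetical wiring of the Crawler; my_filter and my_handler are assumed
# stand-ins for concrete URLFilter and Handler implementations.
import aiohttp

from secretscraper.crawler import Crawler   # import path assumed
from secretscraper.urlparser import URLParser

client = aiohttp.ClientSession()             # normally created inside a coroutine
crawler = Crawler(
    start_urls=["https://example.com"],      # placeholder start url
    client=client,
    url_filter=my_filter,                    # assumed URLFilter instance
    parser=URLParser(),
    handler=my_handler,                      # assumed Handler instance
    max_depth=2,
    num_workers=20,
)
crawler.start()                              # blocks until the crawl finishes

Crawler.start drives its own event loop via run_until_complete, so the call is synchronous from the caller's point of view; clean() is only invoked internally when run() raises.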
@@ -0,0 +1,36 @@
"""Output the crawl result to file or terminal""" | ||
import typing | ||
|
||
from .entity import URL, Secret, URLNode | ||
|
||
|
||
class Formatter: | ||
"""Colorful output for terminal and non-colorful output for out-file""" | ||
|
||
def __init__(self, out_file: typing.IO = None) -> None: | ||
self.out_file = out_file | ||
|
||
def output_found_domains(self, found_urls: typing.Iterable[URLNode]) -> str: | ||
"""Output the found domains""" | ||
found_urls_str = "\n".join(str(url) for url in found_urls) | ||
return found_urls_str | ||
|
||
def output_url_hierarchy(self, url_dict: dict[URLNode, typing.Iterable[URLNode]]) -> str: | ||
"""Output the url hierarchy""" | ||
url_hierarchy = "" | ||
for base, urls in url_dict.items(): | ||
urls_str = "\n\t".join(str(url.url) for url in urls) | ||
url_hierarchy += f"{base}:\n\t{urls_str}\n" | ||
return url_hierarchy | ||
|
||
def output_secrets(self, url_secrets: dict[URLNode, typing.Iterable[Secret]]) -> str: | ||
"""Output all secrets found | ||
:type secrets: dict[str, set[Secret]] | ||
:param secrets: dict keys indicate url and values indicate the secrets found from the url | ||
""" | ||
url_secrets = "" | ||
for url, secrets in url_secrets: | ||
secrets_str = "\n\t".join(str(secret) for secret in secrets) | ||
url_secrets += f"{url.url}:\n\t{secrets_str}\n" | ||
return url_secrets |
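
A short sketch of rendering the crawl results once a crawl has finished. It only uses attributes the Crawler above actually exposes (found_urls, url_dict, url_secrets); the crawler variable is assumed to come from the wiring sketch after the Crawler class.

# Assumes `crawler` is a Crawler instance whose run has already completed.
formatter = Formatter()
print(formatter.output_found_domains(crawler.found_urls))
print(formatter.output_url_hierarchy(crawler.url_dict))   # url_dict is initialized but not yet filled by the Crawler shown above
print(formatter.output_secrets(crawler.url_secrets))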
@@ -0,0 +1,42 @@
"""Extract URL nodes in HTML page.""" | ||
from typing import Set | ||
from urllib.parse import ParseResult, urlparse | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
from .entity import URL, URLNode | ||
|
||
|
||
class URLParser: | ||
"""Extract URL nodes in HTML""" | ||
|
||
def __init__(self): | ||
pass | ||
|
||
def extract_urls(self, base_url: URLNode, text: str) -> Set[URLNode]: | ||
"""Extract URL nodes""" | ||
found_urls: Set[URLNode] = set() | ||
soup = BeautifulSoup(text, "html.parser") | ||
current_depth = base_url.depth + 1 | ||
links = soup.find_all("a") | ||
|
||
for link in links: | ||
href = str(link['href']) | ||
if href is not None: | ||
url_obj = urlparse(href) | ||
if url_obj.netloc is not None and len(url_obj.netloc) > 0: | ||
# a full url | ||
node = URLNode(depth=current_depth, parent=base_url, url=url_obj.geturl(), url_object=url_obj) | ||
found_urls.add(node) | ||
else: | ||
# only a path on base_url | ||
url_obj = URL(scheme=base_url.url_object.scheme, | ||
netloc=base_url.url_object.netloc, | ||
path=url_obj.path, | ||
params=url_obj.params, | ||
query=url_obj.query, | ||
fragment=url_obj.fragment | ||
) | ||
node = URLNode(depth=current_depth, parent=base_url, url=url_obj.geturl(), url_object=url_obj) | ||
found_urls.add(node) | ||
return found_urls |
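
A small standalone sketch of extract_urls. The URLNode constructor call mirrors the keyword arguments used in Crawler.run above; the domains are placeholders.

from urllib.parse import urlparse

from secretscraper.entity import URLNode
from secretscraper.urlparser import URLParser

base = URLNode(url="https://example.com/", url_object=urlparse("https://example.com/"),
               depth=0, parent=None)
html = '<a href="/about">About</a><a href="https://other.example/page">Other</a>'
for node in URLParser().extract_urls(base, html):
    print(node.depth, node.url)   # each child node sits at depth 1 with an absolute url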