Commit: update
PadishahIII committed Apr 12, 2024
1 parent 5f571d9 commit 6cfa65e
Showing 22 changed files with 1,766 additions and 14 deletions.
430 changes: 429 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion pyproject.toml
@@ -18,6 +18,7 @@ pytest-asyncio = "^0.23.6"
pytest-benchmark = "^4.0.0"
hyperscan = "^0.7.7"
bs4 = "^0.0.2"
aiohttp = "^3.9.4"

[tool.poetry.group.dev.dependencies]
pylint = "^2.17.4"
@@ -42,8 +43,11 @@ log_cli = true
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
addopts = "--durations=10"
addopts = "--durations=10 --cov-report html --cov=src"
junit_duration_report = "total"
filterwarnings = [
"ignore"
]

[tool.pylint.design]
max-line-length = 120
349 changes: 349 additions & 0 deletions requirements.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/secretscraper/coroutinue.py
@@ -130,7 +130,7 @@ def create_pool(
event_loop: asyncio.AbstractEventLoop,
):
"""Factory function for creating AsyncPoolCollector
:param queue_capacity: maximum size of task queue, 0 for infinite queue
:return: AsyncPoolCollector
"""
pool = (
130 changes: 130 additions & 0 deletions src/secretscraper/crawler.py
@@ -0,0 +1,130 @@
"""The facade interfaces to integrate crawler, filter, and handler"""
import asyncio
import logging
import queue
import typing
from typing import Set
from urllib.parse import urlparse

import aiohttp
from aiohttp import ClientResponse

from secretscraper.coroutinue import AsyncPoolCollector, AsyncTask
from secretscraper.entity import URL, Secret, URLNode
from secretscraper.filter import URLFilter
from secretscraper.handler import Handler
from secretscraper.urlparser import URLParser

from .exception import CrawlerException

logger = logging.getLogger(__name__)


class Crawler:
"""Crawler interface"""

def __init__(self,
start_urls: list[str],
client: aiohttp.ClientSession,
url_filter: URLFilter,
parser: URLParser,
handler: Handler,
max_page_num: int = 0,
max_depth: int = 3,
num_workers: int = 100,
):
"""
:param start_urls: urls to start crawl from
:param client: aiohttp client
:param url_filter: determine whether a url should be crawled
:param parser: extract child url nodes from html
:param handler: how to deal with the crawl result
:param max_page_num: max number of urls to crawl, 0 for no limit
:param max_depth: max url depth, should greater than 0
:param num_workers: worker number of the async pool
"""
self.start_urls = start_urls
self.client = client
self.filter = url_filter
self.parser = parser
self.handler = handler
self.max_page_num = max_page_num
self.max_depth = max_depth
self.num_workers = num_workers

self.visited_urls: Set[URLNode] = set()
self.found_urls: Set[URLNode] = set() # newly found urls
self.working_queue: queue.Queue[URLNode] = queue.Queue() # BP queue
self.url_dict: dict[URLNode, set[URLNode]] = dict() # url and all of its children url
self.total_page: int = 0 # total number of pages found
self.url_secrets: dict[URLNode, set[Secret]] = dict() # url and secrets found from it
self._event_loop = asyncio.new_event_loop()
self.pool: AsyncPoolCollector = AsyncPoolCollector.create_pool(
num_workers=num_workers,
queue_capacity=0,
event_loop=self._event_loop
)

def start(self):
"""Start event loop"""
self._event_loop.run_until_complete(self.run())

async def run(self):
"""Start the crawler"""
try:

# initialize with start_urls
for url in self.start_urls:
url_obj = urlparse(url)
url_node = URLNode(url=url, url_object=url_obj, depth=0, parent=None)
# self.found_urls.add(url_node)
self.visited_urls.add(url_node)
self.working_queue.put(url_node)

while True:
if self.working_queue.empty() and self.pool.is_finish:
break

try:
url_node = self.working_queue.get_nowait()
except queue.Empty:
await asyncio.sleep(0.1)
continue
if url_node.depth <= self.max_depth:
task = AsyncTask(self.process_one, url_node)
await self.pool.submit(task)
except Exception as e:
await self.clean()
raise CrawlerException("Unexpected Exception") from e

async def process_one(self, url_node: URLNode):
"""Fetch, extract url children and execute handler on result"""
response = await self.fetch(url_node.url)
if response.status == 200 and response.content_type == 'text/html':
# call handler and urlparser
# extract secrets TODO: nonblocking extract
response_text: str = await response.text(encoding="utf8", errors="ignore")
secrets = self.handler.handle(response_text)
if secrets is not None:
self.url_secrets[url_node] = set(secrets)
# extract links TODO: nonblocking extract
url_children: set[URLNode] = self.parser.extract_urls(response_text)
for child in url_children:
if child is not None and child not in self.visited_urls:
self.found_urls.add(child)
self.working_queue.put(child)
else:
# no extend on this branch
return
logger.debug(f"Process_one {url_node.url} get response: {response.status} ")

async def fetch(self, url: str) -> ClientResponse:
"""Wrapper for sending http request"""
response = await self.client.get(url, allow_redirects=False)
return response

async def clean(self):
"""Close pool, cancel tasks, close http client session"""
await self.client.close()
await self.pool.close()
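For orientation, a minimal wiring sketch for the new Crawler facade (not part of the commit). The no-argument URLFilter construction and the BSHandler filter function are assumptions for illustration; their real signatures live outside this diff.

import aiohttp

from secretscraper.crawler import Crawler
from secretscraper.filter import URLFilter
from secretscraper.handler import BSHandler
from secretscraper.urlparser import URLParser

# Hypothetical wiring; URLFilter() with no arguments is an assumption not shown in this commit.
client = aiohttp.ClientSession()  # depending on the aiohttp version, the session may need to be created on the crawler's own loop
handler = BSHandler(lambda soup: list(soup.find_all("a")))  # treat every <a> tag as a result
crawler = Crawler(
    start_urls=["https://news.ycombinator.com"],
    client=client,
    url_filter=URLFilter(),  # assumed no-arg constructor
    parser=URLParser(),
    handler=handler,
    max_depth=2,
    num_workers=20,
)
crawler.start()  # blocks until the queue drains; secrets end up in crawler.url_secrets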
18 changes: 12 additions & 6 deletions src/secretscraper/entity.py
@@ -7,13 +7,19 @@
URL: typing.TypeAlias = ParseResult


@dataclass
@dataclass(unsafe_hash=True, eq=True)
class URLNode:
    """URL node used in site map."""
    depth: int
    parent: typing.Optional['URLNode']
    url: str
    url_object: typing.NamedTuple
    """URL node used in site map.
    Compare based on url_object.
    """
    url: str = field(hash=False, compare=False)
    url_object: ParseResult = field(hash=True, compare=True)
    depth: int = field(default=0, hash=False, compare=False)
    parent: typing.Optional['URLNode'] = field(default=None, hash=False, compare=False)

    def __post_init__(self):
        if self.parent is not None and self.depth <= self.parent.depth:
            raise ValueError(f"URLNode: depth({self.depth}) must be greater than that of parent({self.parent.depth})")


@dataclass(eq=True, frozen=True)
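A short illustration (not from the commit) of the new comparison semantics: two URLNode instances for the same parsed URL hash and compare equal even when depth and parent differ, which is what lets the crawler's visited_urls and found_urls sets deduplicate pages.

from urllib.parse import urlparse

from secretscraper.entity import URLNode

root = URLNode(url="https://example.com/", url_object=urlparse("https://example.com/"))
a = URLNode(url="https://example.com/about", url_object=urlparse("https://example.com/about"), depth=1, parent=root)
b = URLNode(url="https://example.com/about", url_object=urlparse("https://example.com/about"), depth=2, parent=a)

assert a == b                  # equality ignores url, depth and parent
assert len({root, a, b}) == 2  # one node per distinct url_object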
4 changes: 4 additions & 0 deletions src/secretscraper/exception.py
@@ -16,3 +16,7 @@ class AsyncPoolException(SecretScraperException):
class HandlerException(SecretScraperException):
    """Exception raised by handlers module"""
    pass

class CrawlerException(SecretScraperException):
    """Exception raised by crawler module"""
    pass
4 changes: 2 additions & 2 deletions src/secretscraper/handler.py
@@ -111,7 +111,7 @@ def on_match(id: int, froms: int, to: int, flags: int, context: typing.Optional[
class BSHandler(Handler):
    """BeautifulSoup handler that filters html elements on demand"""

    def __init__(self, filter_func: typing.Callable[[BeautifulSoup], BSResult]) -> None:
    def __init__(self, filter_func: typing.Callable[[BeautifulSoup], list[BSResult]]) -> None:
        self.filter = filter_func

    def handle(self, text: str) -> typing.Iterable[Secret]:
@@ -121,7 +121,7 @@ def handle(self, text: str) -> typing.Iterable[Secret]:
        :param text: should be in html format
        """
        soup = BeautifulSoup(text, "html.parser")
        result: BSResult = self.filter(soup)
        result: list[BSResult] = self.filter(soup)
        results: list[Secret] = list()
        if result is not None:
            secret = Secret(type="HTML Element", data=result)
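A hedged example (not in the diff) of the updated BSHandler contract: the filter function now returns a list of results, and handle() wraps whatever comes back into a Secret of type "HTML Element". The concrete BSResult type is not shown in this hunk, so the list of src strings below is only illustrative.

from secretscraper.handler import BSHandler

# Hypothetical filter: collect the target of every <script src=...> tag on the page.
handler = BSHandler(lambda soup: [tag["src"] for tag in soup.find_all("script", src=True)])

html = '<html><head><script src="/static/app.js"></script></head><body>hello</body></html>'
for secret in handler.handle(html):
    print(secret)  # one Secret whose data holds ['/static/app.js']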
36 changes: 36 additions & 0 deletions src/secretscraper/output_formatter.py
@@ -0,0 +1,36 @@
"""Output the crawl result to file or terminal"""
import typing

from .entity import URL, Secret, URLNode


class Formatter:
"""Colorful output for terminal and non-colorful output for out-file"""

def __init__(self, out_file: typing.IO = None) -> None:
self.out_file = out_file

def output_found_domains(self, found_urls: typing.Iterable[URLNode]) -> str:
"""Output the found domains"""
found_urls_str = "\n".join(str(url) for url in found_urls)
return found_urls_str

def output_url_hierarchy(self, url_dict: dict[URLNode, typing.Iterable[URLNode]]) -> str:
"""Output the url hierarchy"""
url_hierarchy = ""
for base, urls in url_dict.items():
urls_str = "\n\t".join(str(url.url) for url in urls)
url_hierarchy += f"{base}:\n\t{urls_str}\n"
return url_hierarchy

def output_secrets(self, url_secrets: dict[URLNode, typing.Iterable[Secret]]) -> str:
"""Output all secrets found
:type secrets: dict[str, set[Secret]]
:param secrets: dict keys indicate url and values indicate the secrets found from the url
"""
url_secrets = ""
for url, secrets in url_secrets:
secrets_str = "\n\t".join(str(secret) for secret in secrets)
url_secrets += f"{url.url}:\n\t{secrets_str}\n"
return url_secrets
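A quick sketch (not part of the commit) of the hierarchy formatter; base keys are rendered with the URLNode repr and children by their .url attribute.

from urllib.parse import urlparse

from secretscraper.entity import URLNode
from secretscraper.output_formatter import Formatter

base = URLNode(url="https://example.com/", url_object=urlparse("https://example.com/"))
child = URLNode(url="https://example.com/docs", url_object=urlparse("https://example.com/docs"), depth=1, parent=base)

formatter = Formatter()
print(formatter.output_url_hierarchy({base: {child}}))  # prints the base node followed by an indented https://example.com/docs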
42 changes: 42 additions & 0 deletions src/secretscraper/urlparser.py
@@ -0,0 +1,42 @@
"""Extract URL nodes in HTML page."""
from typing import Set
from urllib.parse import ParseResult, urlparse

from bs4 import BeautifulSoup

from .entity import URL, URLNode


class URLParser:
"""Extract URL nodes in HTML"""

def __init__(self):
pass

def extract_urls(self, base_url: URLNode, text: str) -> Set[URLNode]:
"""Extract URL nodes"""
found_urls: Set[URLNode] = set()
soup = BeautifulSoup(text, "html.parser")
current_depth = base_url.depth + 1
links = soup.find_all("a")

for link in links:
href = str(link['href'])
if href is not None:
url_obj = urlparse(href)
if url_obj.netloc is not None and len(url_obj.netloc) > 0:
# a full url
node = URLNode(depth=current_depth, parent=base_url, url=url_obj.geturl(), url_object=url_obj)
found_urls.add(node)
else:
# only a path on base_url
url_obj = URL(scheme=base_url.url_object.scheme,
netloc=base_url.url_object.netloc,
path=url_obj.path,
params=url_obj.params,
query=url_obj.query,
fragment=url_obj.fragment
)
node = URLNode(depth=current_depth, parent=base_url, url=url_obj.geturl(), url_object=url_obj)
found_urls.add(node)
return found_urls
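For reference, a small usage sketch (not in the commit) showing how extract_urls resolves relative links against the base node while keeping absolute links untouched.

from urllib.parse import urlparse

from secretscraper.entity import URLNode
from secretscraper.urlparser import URLParser

base = URLNode(url="https://example.com/index.html", url_object=urlparse("https://example.com/index.html"))
html = '<a href="/about">About</a> <a href="https://other.example.org/x">External</a>'

children = URLParser().extract_urls(base, html)
print(sorted(node.url for node in children))
# expected: ['https://example.com/about', 'https://other.example.org/x']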
9 changes: 9 additions & 0 deletions tests/conftest.py
@@ -11,6 +11,7 @@ def clicker():
    """clicker fixture"""
    yield CliRunner()


@pytest.fixture
@functools.cache
def resource_text() -> str:
@@ -19,6 +20,14 @@ def resource_text() -> str:
    return s


@pytest.fixture
@functools.cache
def html_text() -> str:
    with open(Path(__file__).parent / 'resources' / 'HackerNews.html') as f:
        s = f.read()
    return s


@pytest.fixture
@functools.cache
def regex_dict() -> dict[str, str]:
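A sketch of a test (hypothetical, not part of this commit) that could consume the new html_text fixture together with URLParser.

from urllib.parse import urlparse

from secretscraper.entity import URLNode
from secretscraper.urlparser import URLParser


def test_extract_urls_from_hackernews(html_text):  # hypothetical test name and assertion
    base = URLNode(url="https://news.ycombinator.com", url_object=urlparse("https://news.ycombinator.com"))
    nodes = URLParser().extract_urls(base, html_text)
    assert len(nodes) > 0  # assumes the saved HackerNews page contains anchor tags with href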
302 changes: 302 additions & 0 deletions tests/resources/HackerNews.html

Large diffs are not rendered by default.
