diff --git a/.gitignore b/.gitignore
index 369e8e6..ebe127e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -186,3 +186,4 @@ venv.bak/
### Dynaconf config
**/*.local.yml
**/.secrets.yml
+*.csv
diff --git a/README.md b/README.md
index 15bffdb..910b0b0 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ Options:
2(thorough) for max_depth=2, default 1
--max-page INTEGER Max page number to crawl, default 100000
--max-depth INTEGER Max depth to crawl, default 1
- -o, --outfile FILE Output result to specified file
+ -o, --outfile FILE Output result to specified file in csv format
-s, --status TEXT Filter response status to display, seperated by
commas, e.g. 200,300-400
-x, --proxy TEXT Set proxy, e.g. http://127.0.0.1:8080,
@@ -122,9 +122,9 @@ to `--max-depth 2`. By default the normal mode `-m 1` is adopted with max depth
secretscraper -u https://scrapeme.live/shop/ -m 2
```
-#### Write Results to File
+#### Write Results to a CSV File
```bash
-secretscraper -u https://scrapeme.live/shop/ -o result.log
+secretscraper -u https://scrapeme.live/shop/ -o result.csv
```
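+
+Each CSV row contains the URL, title, response code, content length, content type, and any matched secrets.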
#### Hide Regex Result
@@ -138,6 +138,15 @@ secretscraper -u https://scrapeme.live/shop/ -H
secretscraper -l
```
+#### Switch to Hyperscan
+I have implemented the regex matching with both the `hyperscan` and `re` modules; `re` is used by default. If you pursue higher performance, you can switch to `hyperscan` by setting `handler_type` to `hyperscan` in `settings.yml`.
+
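+For example, set the following in `settings.yml` (the full default config is shown under Customize Configuration below):
+```yaml
+handler_type: hyperscan
+```
+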
+There are some pitfalls of `hyperscan` that you should keep in mind before using it:
+1. No support for regex groups: you cannot extract content with parentheses.
+2. The regex syntax differs from `re`.
+
+It is best to write separate regexes for the two engines.
+
#### Customize Configuration
The built-in config is shown as below. You can assign custom configuration via `-i settings.yml`.
```yaml
@@ -145,6 +154,7 @@ verbose: false
debug: false
loglevel: critical
logpath: log
+handler_type: re
proxy: "" # http://127.0.0.1:7890
max_depth: 1 # 0 for no limit
@@ -231,6 +241,10 @@ rules:
---
# Change Log
+## 2024.5.25 Version 1.4
+- Support CSV output
+- Set the `re` module as the default regex engine
+- Support selecting the regex engine via the `handler_type` configuration option
## 2024.4.30 Version 1.3.9
- Add `--validate` option: Validate urls after the crawler finish, which helps reduce useless links
- Optimize url collector
diff --git a/pyproject.toml b/pyproject.toml
index ab203a5..1fbf384 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "secretscraper"
-version = "1.3.9.4"
+version = "1.4"
description = "SecretScraper is a web scraper tool that can scrape the content through target websites and extract secret information via regular expression."
readme = "README.md"
authors = ["Padishah "]
diff --git a/src/secretscraper/__init__.py b/src/secretscraper/__init__.py
index d2f2795..91d3a4c 100644
--- a/src/secretscraper/__init__.py
+++ b/src/secretscraper/__init__.py
@@ -1,3 +1,3 @@
"""SecretScraper"""
-__version__ = "1.3.9.4"
+__version__ = "1.4"
diff --git a/src/secretscraper/cmdline.py b/src/secretscraper/cmdline.py
index f68898e..8f3af55 100644
--- a/src/secretscraper/cmdline.py
+++ b/src/secretscraper/cmdline.py
@@ -1,10 +1,11 @@
"""Command line"""
-
+import dataclasses
import functools
import logging
import pathlib
import click
+import dynaconf
from click import Context
from dynaconf.base import Settings
@@ -17,6 +18,13 @@
facade_obj = None
+@dataclasses.dataclass
+class ExternalEntry:
+    """Expose objects to external libraries"""
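+    # Populated in start() once the crawler facade is created, so that external
+    # callers can read the facade and its settings after a run.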
+ facade_obj: CrawlerFacade
+ facade_settings: dynaconf.Dynaconf
+
+
# @click.group(invoke_without_command=True)
# @click.pass_context
@click.command()
@@ -78,7 +86,7 @@
@click.option(
"-o",
"--outfile",
- help="Output result to specified file",
+ help="Output result to specified file in csv format",
type=click.Path(
exists=False, file_okay=True, dir_okay=False, path_type=pathlib.Path
),
@@ -105,6 +113,10 @@
type=click.Path(exists=True, file_okay=True, dir_okay=True, path_type=pathlib.Path))
def main(**options):
"""Main commands"""
+ start(options)
+
+
+def start(options: dict):
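+    """Run the crawler with the given options dict; shared by the CLI and external callers."""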
if options["version"]:
click.echo(__version__)
exit(0)
@@ -138,7 +150,9 @@ def main(**options):
else:
facade = CrawlerFacade(settings, options_dict, print_func=print_func)
facade_obj = facade
+ ExternalEntry.facade_obj = facade
facade_settings = facade.settings
+ ExternalEntry.facade_settings = facade_settings
except FacadeException as e:
click.echo(f"Error: {e}")
exit(1)
@@ -157,6 +171,7 @@ def generate_configuration(file: pathlib.Path):
debug: false
loglevel: critical
logpath: log
+handler_type: re
proxy: "" # http://127.0.0.1:7890
max_depth: 1 # 0 for no limit
diff --git a/src/secretscraper/crawler.py b/src/secretscraper/crawler.py
index 4a2d465..eba2300 100644
--- a/src/secretscraper/crawler.py
+++ b/src/secretscraper/crawler.py
@@ -28,7 +28,7 @@
from .config import settings
from .exception import CrawlerException
-from .util import Range
+from .util import Range, get_response_title
logger = logging.getLogger(__name__)
@@ -244,7 +244,12 @@ async def process_one(self, url_node: URLNode):
response = await self.fetch(url_node.url)
if response is not None: # and response.status == 200
url_node.response_status = str(response.status_code)
-
+ url_node.title = get_response_title(response)
+ try:
+ url_node.content_length = int(response.headers.get('content-length'))
+ except Exception:
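+                # The content-length header may be missing or non-numeric; keep the default -1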
+ pass
+ url_node.content_type = response.headers.get('content-type')
response_text: str = response.text
# try:
# response_text: str = await response.text(
diff --git a/src/secretscraper/entity.py b/src/secretscraper/entity.py
index 3912fae..4f5d68d 100644
--- a/src/secretscraper/entity.py
+++ b/src/secretscraper/entity.py
@@ -23,6 +23,9 @@ class URLNode:
response_status: str = field(default="Unknown", hash=False, compare=False)
depth: int = field(default=0, hash=False, compare=False)
parent: typing.Optional["URLNode"] = field(default=None, hash=False, compare=False)
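+    # Response metadata filled in by the crawler; the defaults mean the value is unknown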
+ content_length: int = field(hash=False, compare=False, default=-1)
+ content_type: str = field(hash=False, compare=False, default="")
+ title: str = field(hash=False, compare=False, default="")
def __post_init__(self):
if self.parent is not None and self.depth <= self.parent.depth:
diff --git a/src/secretscraper/facade.py b/src/secretscraper/facade.py
index b1ef02a..e22ec4c 100644
--- a/src/secretscraper/facade.py
+++ b/src/secretscraper/facade.py
@@ -30,11 +30,12 @@
def print_func(f: typing.IO, func: typing.Callable, content: str, **kwargs) -> None:
func(content, **kwargs)
- func(content, file=f, **kwargs)
+ if f is not None:
+ func(content, file=f, **kwargs)
def print_func_colorful(
- f: typing.IO,
+ f: typing.Optional[typing.IO],
func: typing.Callable,
content: str,
fg: str = None,
@@ -70,7 +71,7 @@ def __init__(
self.custom_settings = custom_settings
self.formatter = Formatter()
self.hide_regex: bool = False
- self.outfile = pathlib.Path(__file__).parent / "crawler.log"
+ self.outfile: typing.Optional[pathlib.Path] = None # pathlib.Path(__file__).parent / "crawler.log"
self.print_func = print_func
self.debug: bool = False
self.follow_redirects: bool = False
@@ -79,57 +80,62 @@ def __init__(
def start(self):
"""Start the crawler and output"""
- with self.outfile.open("w") as f:
- try:
-
- # print_func(f"Starting crawler...")
- print_func_colorful(f,
- self.print_func,
- f"Target URLs: {', '.join(self.crawler.start_urls)}",
- bold=True,
- blink=True,
- )
- self.crawler.start()
- self.crawler.start_validate()
- if self.detail_output:
- # print_func_colorful(self.print_func,f"Total page: {self.crawler.total_page}")
- f.write(self.formatter.output_url_hierarchy(self.crawler.url_dict, True))
-
- if not self.hide_regex:
- print_func_colorful(f, self.print_func,
- f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
- )
- print_func_colorful(f, self.print_func, f"{self.formatter.output_js(self.crawler.js_dict)}")
- f.write(self.formatter.output_found_domains(list(self.crawler.found_urls), True))
- else:
- # tidy output
- # URLs per domain
- domains = set()
- for url in self.crawler.start_urls:
- try:
- obj = urlparse(url)
- domain, _ = to_host_port(obj.netloc)
- if len(domain) > 0:
- domains.add(domain.strip())
- except:
- pass
- f.write(self.formatter.output_url_per_domain(domains, self.crawler.url_dict))
- # JS per domain
- f.write(self.formatter.output_url_per_domain(domains, self.crawler.js_dict, "JS"))
- # Domains
- f.write(self.formatter.output_found_domains(list(self.crawler.found_urls), True))
- # Secrets
- if not self.hide_regex:
- print_func_colorful(f, self.print_func,
- f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
- )
- except KeyboardInterrupt:
- self.print_func("\nExiting...")
- self.crawler.close_all()
- except Exception as e:
- self.print_func(f"Unexpected error: {e}.\nExiting...")
- self.crawler.close_all()
- # raise FacadeException from e
+ # with self.outfile.open("w") as f:
+ f = None
+ try:
+
+ # print_func(f"Starting crawler...")
+ print_func_colorful(f,
+ self.print_func,
+ f"Target URLs: {', '.join(self.crawler.start_urls)}",
+ bold=True,
+ blink=True,
+ )
+ self.crawler.start()
+ self.crawler.start_validate()
+ if self.detail_output:
+ # print_func_colorful(self.print_func,f"Total page: {self.crawler.total_page}")
+ self.formatter.output_url_hierarchy(self.crawler.url_dict, True)
+
+ if not self.hide_regex:
+ print_func_colorful(f, self.print_func,
+ f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
+ )
+ print_func_colorful(f, self.print_func, f"{self.formatter.output_js(self.crawler.js_dict)}")
+ self.formatter.output_found_domains(list(self.crawler.found_urls), True)
+ else:
+ # tidy output
+ # URLs per domain
+ domains = set()
+ for url in self.crawler.start_urls:
+ try:
+ obj = urlparse(url)
+ domain, _ = to_host_port(obj.netloc)
+ if len(domain) > 0:
+ domains.add(domain.strip())
+ except:
+ pass
+ self.formatter.output_url_per_domain(domains, self.crawler.url_dict)
+ # JS per domain
+ self.formatter.output_url_per_domain(domains, self.crawler.js_dict, "JS")
+ # Domains
+ self.formatter.output_found_domains(list(self.crawler.found_urls), True)
+ # Secrets
+ if not self.hide_regex:
+ print_func_colorful(f, self.print_func,
+ f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
+ )
+ if self.outfile is not None:
+ self.formatter.output_csv(self.outfile, self.crawler.url_dict, self.crawler.url_secrets)
+                print_func_colorful(None, self.print_func, f"Results saved to CSV file {self.outfile.name}", fg="green",
+ bold=True)
+ except KeyboardInterrupt:
+ self.print_func("\nExiting...")
+ self.crawler.close_all()
+ except Exception as e:
+ self.print_func(f"Unexpected error: {e}.\nExiting...")
+ self.crawler.close_all()
+ # raise FacadeException from e
def create_crawler(self) -> Crawler:
"""Create a Crawler"""
@@ -260,7 +266,7 @@ def create_crawler(self) -> Crawler:
# Read rules from config file
rules: typing.Dict[str, str] = read_rules_from_setting(self.settings)
- handler_type = self.settings.get("handler_type", "regex")
+ handler_type = self.settings.get("handler_type", "re")
if handler_type == "hyperscan":
handler = get_regex_handler(rules)
print_config(f"Using regex handler: Hyperscan")
diff --git a/src/secretscraper/output_formatter.py b/src/secretscraper/output_formatter.py
index 4862836..ac9f88f 100644
--- a/src/secretscraper/output_formatter.py
+++ b/src/secretscraper/output_formatter.py
@@ -46,6 +46,8 @@ def format_colorful_status(self, status: str) -> str:
return click.style(status, fg="red")
def format_normal_result(self, content: str) -> str:
+ if content == "":
+ return ""
return click.style(content, fg="bright_blue")
def filter(self, url: URLNode) -> bool:
@@ -67,6 +69,13 @@ def filter(self, url: URLNode) -> bool:
return False # default discard
return True
+ def format_single_url(self, url: URLNode) -> str:
+ return self.format_normal_result(f"{str(url.url)}") \
+ + " [" \
+ + self.format_colorful_status(url.response_status) \
+ + "]" \
+               + f" [Content-Length: {self.format_normal_result(str(url.content_length)) if url.content_length > 0 else ''}]" \
+               + f" [Content-Type: {self.format_normal_result(url.content_type)}]" \
+               + f" [Title: {self.format_normal_result(url.title)}]"
+
def output_found_domains(
self, found_urls: typing.Iterable[URLNode], is_print: bool = False
) -> str:
@@ -93,7 +102,7 @@ def output_url_hierarchy(
url_hierarchy = ""
for base, urls in url_dict.items():
url_set = {
- f"{str(url.url)} [{str(url.response_status)}]"
+ self.format_single_url(url)
for url in urls
if self.filter(url)
}
@@ -104,10 +113,7 @@ def output_url_hierarchy(
url_hierarchy = ""
for base, urls in url_dict.items():
url_set = {
- self.format_normal_result(f"{str(url.url)}")
- + " ["
- + self.format_colorful_status(url.response_status)
- + "]"
+ self.format_single_url(url)
for url in urls
if self.filter(url)
}
@@ -148,10 +154,7 @@ def output_url_per_domain(
if urls is None or len(urls) == 0:
continue
url_set = {
- self.format_normal_result(f"{str(url.url)}")
- + " ["
- + self.format_colorful_status(url.response_status)
- + "]"
+ self.format_single_url(url)
for url in urls
if self.filter(url)
}
@@ -227,3 +230,26 @@ def output_local_scan_secrets(self, path_secrets: typing.Dict[pathlib.Path, typi
result += s
click.echo(s)
return result
+
+ def output_csv(
+ self,
+ outfile: pathlib.Path,
+ url_dict: typing.Dict[URLNode, typing.Iterable[URLNode]],
+ url_secrets: typing.Dict[URLNode, typing.Iterable[Secret]],
+ ) -> None:
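+        """Write one CSV row per crawled URL; matched secrets are joined into the last column."""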
+ import csv
+        with outfile.open("w", encoding='utf-8', errors='replace', newline='') as f:  # newline='' avoids blank rows on Windows
+ writer = csv.writer(f)
+ writer.writerow(("URL", "Title", "Response Code", "Content Length", "Content Type", "Secrets"))
+ url_nodes: typing.Set[URLNode] = set()
+ for key, urls in url_dict.items():
+ url_nodes.add(key)
+ for url in urls:
+ url_nodes.add(url)
+ for url in url_nodes:
+ row = [url.url, url.title, url.response_status, url.content_length, url.content_type]
+                if url in url_secrets:
+                    secrets = [f"{secret.type}: {secret.data}" for secret in url_secrets[url]]
+                    row.append('\n'.join(secrets))
+                else:
+                    row.append('')  # keep the column count consistent with the header
+ writer.writerow(row)
diff --git a/src/secretscraper/util.py b/src/secretscraper/util.py
index d92d688..37a766f 100644
--- a/src/secretscraper/util.py
+++ b/src/secretscraper/util.py
@@ -3,8 +3,12 @@
import re
import typing
from collections import namedtuple
+from pathlib import Path
from urllib.parse import urlparse
+from threading import Thread
+import http.server
+import requests
import tldextract
+from bs4 import BeautifulSoup
# from dynaconf import LazySettings
@@ -99,3 +103,37 @@ def is_hyperscan() -> bool:
return True
except ImportError:
return False
+
+
+def get_response_title(response: requests.Response) -> str:
+ """Get the response title"""
+ bs = BeautifulSoup(response.text, "html.parser")
+ titles = list()
+ for t in bs.find_all('title'):
+ text = t.get_text()
+ titles.append(text.replace("\n", " ").replace("\r", " ").strip())
+ return "|".join(titles)
+
+
+def start_local_test_http_server(
+    host: str, port: int, server_dir: typing.Optional[Path] = None
+) -> tuple[Thread, http.server.HTTPServer]:
+    """Start a local HTTP test server in a background thread; call httpd.shutdown() to stop it."""
+
+ if server_dir is None:
+ DIR = str(
+ Path(__file__).parent.parent.parent.joinpath("tests").joinpath("resources").joinpath(
+ "local_server").absolute())
+ else:
+ DIR = str(server_dir.absolute())
+
+ class Handler(http.server.SimpleHTTPRequestHandler):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, directory=DIR, **kwargs)
+
+ httpd = http.server.HTTPServer((host, port), Handler)
+ thread = Thread(target=httpd.serve_forever)
+ thread.start()
+ return thread, httpd
diff --git a/tests/local_tests/local_test_crawler.py b/tests/local_tests/local_test_crawler.py
index 96815b7..d307539 100644
--- a/tests/local_tests/local_test_crawler.py
+++ b/tests/local_tests/local_test_crawler.py
@@ -2,6 +2,7 @@
import functools
import logging
+import typing
import aiohttp
import pytest
diff --git a/tests/test_facade.py b/tests/test_facade.py
index 511b53a..ee7f782 100644
--- a/tests/test_facade.py
+++ b/tests/test_facade.py
@@ -231,13 +231,18 @@ def test_crawler_facade_update_crawler(
)
def test_normal_run(clicker: CliRunner, invoke_args: typing.List[str]):
- result = clicker.invoke(main, invoke_args)
- if result.exception is not None:
- logger.exception(result.exception)
- raise result.exception
- with click.open_file("1.log", "w") as f:
- click.echo(result.output, file=f)
- print(result)
+ from secretscraper.util import start_local_test_http_server
+ thread, httpd = start_local_test_http_server("127.0.0.1", 8888)
+ try:
+ result = clicker.invoke(main, invoke_args)
+ if result.exception is not None:
+ logger.exception(result.exception)
+ raise result.exception
+ with click.open_file("1.log", "w") as f:
+ click.echo(result.output, file=f)
+ print(result)
+ finally:
+ httpd.shutdown()
# @pytest.mark.parametrize( # TODO: cannot copy file in github actions
diff --git a/tests/test_util.py b/tests/test_util.py
index 8646799..554d68f 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -1,4 +1,6 @@
-from secretscraper.util import read_rules_from_setting
+from secretscraper.util import read_rules_from_setting, start_local_test_http_server
+import requests
from . import settings
@@ -6,3 +8,15 @@
def test_read_rules_from_setting():
d = read_rules_from_setting(settings)
assert len(d) > 0
+
+
+def test_start_local_test_http_server():
+ thread, httpd = start_local_test_http_server("127.0.0.1", 8888)
+    try:
+        # Shut the server down even if the request or assertion fails, otherwise
+        # the serve_forever thread keeps the test process alive.
+        res = requests.get("http://127.0.0.1:8888/index.html")
+        assert res.status_code == 200
+    finally:
+        httpd.shutdown()
+        thread.join()