
Commit

1.4
PadishahIII committed May 25, 2024
1 parent d3972f0 commit 54d71bc
Showing 13 changed files with 209 additions and 81 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -186,3 +186,4 @@ venv.bak/
### Dynaconf config
**/*.local.yml
**/.secrets.yml
*.csv
20 changes: 17 additions & 3 deletions README.md
@@ -91,7 +91,7 @@ Options:
2(thorough) for max_depth=2, default 1
--max-page INTEGER Max page number to crawl, default 100000
--max-depth INTEGER Max depth to crawl, default 1
-o, --outfile FILE Output result to specified file
-o, --outfile FILE Output result to specified file in csv format
  -s, --status TEXT    Filter response status to display, separated by
commas, e.g. 200,300-400
-x, --proxy TEXT Set proxy, e.g. http://127.0.0.1:8080,
@@ -122,9 +122,9 @@ to `--max-depth 2`. By default the normal mode `-m 1` is adopted with max depth
secretscraper -u https://scrapeme.live/shop/ -m 2
```

#### Write Results to File
#### Write Results to CSV File
```bash
secretscraper -u https://scrapeme.live/shop/ -o result.log
secretscraper -u https://scrapeme.live/shop/ -o result.csv
```

#### Hide Regex Result
@@ -138,13 +138,23 @@ secretscraper -u https://scrapeme.live/shop/ -H
secretscraper -l <dir or file>
```

#### Switch to hyperscan
I have implemented the regex matching functionality with both the `hyperscan` and `re` modules. The `re` module is used by default; if you pursue higher performance, you can switch to `hyperscan` by changing `handler_type` to `hyperscan` in `settings.yml`.

There are some pitfalls of `hyperscan` that you should take into account before using it:
1. No support for regex groups: you cannot extract content with parentheses.
2. Different regex syntax from `re`.

You'd better write separate regex rules for the two engines; the sketch below illustrates the first pitfall.
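As a rough illustration (not code from SecretScraper, and using a made-up rule), the following shows what capture groups provide under `re` and what `hyperscan` cannot give you:

```python
import re

# Hypothetical secret rule: the parentheses form a capture group, so `re`
# can return just the key value instead of the whole matched string.
pattern = re.compile(r"api_key\s*=\s*'([A-Za-z0-9]{16,})'")

text = "config: api_key = 'ABCDEF0123456789'"
match = pattern.search(text)
if match:
    print(match.group(0))  # whole match: api_key = 'ABCDEF0123456789'
    print(match.group(1))  # captured value only: ABCDEF0123456789

# Hyperscan reports only that and where an expression matched (rule id plus
# start/end offsets); there is no group(1), so rules that rely on capturing
# only make sense with the `re` engine and must be rewritten for hyperscan.
```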

#### Customize Configuration
The built-in config is shown as below. You can assign custom configuration via `-i settings.yml`.
```yaml
verbose: false
debug: false
loglevel: critical
logpath: log
handler_type: re

proxy: "" # http://127.0.0.1:7890
max_depth: 1 # 0 for no limit
@@ -231,6 +241,10 @@ rules:
---

# Change Log
## 2024.5.25 Version 1.4
- Support CSV output
- Use the `re` module as the default regex engine
- Support selecting the regex engine via the `handler_type` configuration
## 2024.4.30 Version 1.3.9
- Add `--validate` option: Validate URLs after the crawler finishes, which helps reduce useless links
- Optimize url collector
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "secretscraper"
version = "1.3.9.4"
version = "1.4"
description = "SecretScraper is a web scraper tool that can scrape the content through target websites and extract secret information via regular expression."
readme = "README.md"
authors = ["Padishah <[email protected]>"]
2 changes: 1 addition & 1 deletion src/secretscraper/__init__.py
@@ -1,3 +1,3 @@
"""SecretScraper"""

__version__ = "1.3.9.4"
__version__ = "1.4"
19 changes: 17 additions & 2 deletions src/secretscraper/cmdline.py
@@ -1,10 +1,11 @@
"""Command line"""

import dataclasses
import functools
import logging
import pathlib

import click
import dynaconf
from click import Context
from dynaconf.base import Settings

@@ -17,6 +18,13 @@
facade_obj = None


@dataclasses.dataclass
class ExternalEntry:
"""Expose objects for external library"""
facade_obj: CrawlerFacade
facade_settings: dynaconf.Dynaconf


# @click.group(invoke_without_command=True)
# @click.pass_context
@click.command()
@@ -78,7 +86,7 @@
@click.option(
"-o",
"--outfile",
help="Output result to specified file",
help="Output result to specified file in csv format",
type=click.Path(
exists=False, file_okay=True, dir_okay=False, path_type=pathlib.Path
),
@@ -105,6 +113,10 @@
type=click.Path(exists=True, file_okay=True, dir_okay=True, path_type=pathlib.Path))
def main(**options):
"""Main commands"""
start(options)


def start(options: dict):
if options["version"]:
click.echo(__version__)
exit(0)
@@ -138,7 +150,9 @@ def main(**options):
else:
facade = CrawlerFacade(settings, options_dict, print_func=print_func)
facade_obj = facade
ExternalEntry.facade_obj = facade
facade_settings = facade.settings
ExternalEntry.facade_settings = facade_settings
except FacadeException as e:
click.echo(f"Error: {e}")
exit(1)
@@ -157,6 +171,7 @@ def generate_configuration(file: pathlib.Path):
debug: false
loglevel: critical
logpath: log
handler_type: re
proxy: "" # http://127.0.0.1:7890
max_depth: 1 # 0 for no limit
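The `ExternalEntry` dataclass and the `start()` wrapper introduced above split the CLI entry point so the crawler can also be driven from another Python program. A rough, untested sketch of that usage, assuming the caller assembles the same option dict that Click would normally pass to `main()`:

```python
from secretscraper.cmdline import ExternalEntry, start


def run_scan(options: dict) -> None:
    """Drive SecretScraper programmatically (illustrative sketch only)."""
    # `options` must contain every key the Click command defines (the CLI
    # flags shown in the README); assembling that dict is left to the caller.
    start(options)
    # After start() returns, the facade and its resolved settings are
    # published as class attributes for the embedding program to inspect.
    settings = ExternalEntry.facade_settings
    facade = ExternalEntry.facade_obj
    print("regex engine:", settings.get("handler_type"))
    print("URL nodes recorded:", len(facade.crawler.url_dict))
```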
9 changes: 7 additions & 2 deletions src/secretscraper/crawler.py
@@ -28,7 +28,7 @@

from .config import settings
from .exception import CrawlerException
from .util import Range
from .util import Range, get_response_title

logger = logging.getLogger(__name__)

@@ -244,7 +244,12 @@ async def process_one(self, url_node: URLNode):
response = await self.fetch(url_node.url)
if response is not None: # and response.status == 200
url_node.response_status = str(response.status_code)

url_node.title = get_response_title(response)
try:
url_node.content_length = int(response.headers.get('content-length'))
except Exception:
pass
url_node.content_type = response.headers.get('content-type')
response_text: str = response.text
# try:
# response_text: str = await response.text(
3 changes: 3 additions & 0 deletions src/secretscraper/entity.py
@@ -23,6 +23,9 @@ class URLNode:
response_status: str = field(default="Unknown", hash=False, compare=False)
depth: int = field(default=0, hash=False, compare=False)
parent: typing.Optional["URLNode"] = field(default=None, hash=False, compare=False)
content_length: int = field(hash=False, compare=False, default=-1)
content_type: str = field(hash=False, compare=False, default="")
title: str = field(hash=False, compare=False, default="")

def __post_init__(self):
if self.parent is not None and self.depth <= self.parent.depth:
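Because the three new fields are declared with `hash=False, compare=False`, the fetched metadata never affects node identity: two `URLNode`s for the same URL still compare equal and collapse into one entry in sets or dict keys. A stripped-down, standalone sketch of that dataclass behaviour (a stand-in, not the real `URLNode`):

```python
from dataclasses import dataclass, field


@dataclass(frozen=True)  # frozen only to make this sketch hashable
class MiniNode:
    url: str
    title: str = field(default="", hash=False, compare=False)
    content_length: int = field(default=-1, hash=False, compare=False)


a = MiniNode("https://example.com", title="Example", content_length=1024)
b = MiniNode("https://example.com")
assert a == b              # metadata is excluded from equality...
assert hash(a) == hash(b)  # ...and from the hash,
assert len({a, b}) == 1    # so set/dict de-duplication is unaffected
```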
116 changes: 61 additions & 55 deletions src/secretscraper/facade.py
@@ -30,11 +30,12 @@

def print_func(f: typing.IO, func: typing.Callable, content: str, **kwargs) -> None:
func(content, **kwargs)
func(content, file=f, **kwargs)
if f is not None:
func(content, file=f, **kwargs)


def print_func_colorful(
f: typing.IO,
f: typing.Optional[typing.IO],
func: typing.Callable,
content: str,
fg: str = None,
@@ -70,7 +71,7 @@ def __init__(
self.custom_settings = custom_settings
self.formatter = Formatter()
self.hide_regex: bool = False
self.outfile = pathlib.Path(__file__).parent / "crawler.log"
self.outfile: typing.Optional[pathlib.Path] = None # pathlib.Path(__file__).parent / "crawler.log"
self.print_func = print_func
self.debug: bool = False
self.follow_redirects: bool = False
@@ -79,57 +80,62 @@ def __init__(

def start(self):
"""Start the crawler and output"""
with self.outfile.open("w") as f:
try:

# print_func(f"Starting crawler...")
print_func_colorful(f,
self.print_func,
f"Target URLs: {', '.join(self.crawler.start_urls)}",
bold=True,
blink=True,
)
self.crawler.start()
self.crawler.start_validate()
if self.detail_output:
# print_func_colorful(self.print_func,f"Total page: {self.crawler.total_page}")
f.write(self.formatter.output_url_hierarchy(self.crawler.url_dict, True))

if not self.hide_regex:
print_func_colorful(f, self.print_func,
f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
)
print_func_colorful(f, self.print_func, f"{self.formatter.output_js(self.crawler.js_dict)}")
f.write(self.formatter.output_found_domains(list(self.crawler.found_urls), True))
else:
# tidy output
# URLs per domain
domains = set()
for url in self.crawler.start_urls:
try:
obj = urlparse(url)
domain, _ = to_host_port(obj.netloc)
if len(domain) > 0:
domains.add(domain.strip())
except:
pass
f.write(self.formatter.output_url_per_domain(domains, self.crawler.url_dict))
# JS per domain
f.write(self.formatter.output_url_per_domain(domains, self.crawler.js_dict, "JS"))
# Domains
f.write(self.formatter.output_found_domains(list(self.crawler.found_urls), True))
# Secrets
if not self.hide_regex:
print_func_colorful(f, self.print_func,
f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
)
except KeyboardInterrupt:
self.print_func("\nExiting...")
self.crawler.close_all()
except Exception as e:
self.print_func(f"Unexpected error: {e}.\nExiting...")
self.crawler.close_all()
# raise FacadeException from e
# with self.outfile.open("w") as f:
f = None
try:

# print_func(f"Starting crawler...")
print_func_colorful(f,
self.print_func,
f"Target URLs: {', '.join(self.crawler.start_urls)}",
bold=True,
blink=True,
)
self.crawler.start()
self.crawler.start_validate()
if self.detail_output:
# print_func_colorful(self.print_func,f"Total page: {self.crawler.total_page}")
self.formatter.output_url_hierarchy(self.crawler.url_dict, True)

if not self.hide_regex:
print_func_colorful(f, self.print_func,
f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
)
print_func_colorful(f, self.print_func, f"{self.formatter.output_js(self.crawler.js_dict)}")
self.formatter.output_found_domains(list(self.crawler.found_urls), True)
else:
# tidy output
# URLs per domain
domains = set()
for url in self.crawler.start_urls:
try:
obj = urlparse(url)
domain, _ = to_host_port(obj.netloc)
if len(domain) > 0:
domains.add(domain.strip())
except:
pass
self.formatter.output_url_per_domain(domains, self.crawler.url_dict)
# JS per domain
self.formatter.output_url_per_domain(domains, self.crawler.js_dict, "JS")
# Domains
self.formatter.output_found_domains(list(self.crawler.found_urls), True)
# Secrets
if not self.hide_regex:
print_func_colorful(f, self.print_func,
f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
)
if self.outfile is not None:
self.formatter.output_csv(self.outfile, self.crawler.url_dict, self.crawler.url_secrets)
print_func_colorful(None, self.print_func, f"Save result to csv file {self.outfile.name}", fg="green",
bold=True)
except KeyboardInterrupt:
self.print_func("\nExiting...")
self.crawler.close_all()
except Exception as e:
self.print_func(f"Unexpected error: {e}.\nExiting...")
self.crawler.close_all()
# raise FacadeException from e

def create_crawler(self) -> Crawler:
"""Create a Crawler"""
@@ -260,7 +266,7 @@ def create_crawler(self) -> Crawler:

# Read rules from config file
rules: typing.Dict[str, str] = read_rules_from_setting(self.settings)
handler_type = self.settings.get("handler_type", "regex")
handler_type = self.settings.get("handler_type", "re")
if handler_type == "hyperscan":
handler = get_regex_handler(rules)
print_config(f"Using regex handler: Hyperscan")