
Commit

1.4
PadishahIII committed May 25, 2024
1 parent d3972f0 commit 54d71bc
Showing 13 changed files with 209 additions and 81 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -186,3 +186,4 @@ venv.bak/
### Dynaconf config
**/*.local.yml
**/.secrets.yml
*.csv
20 changes: 17 additions & 3 deletions README.md
@@ -91,7 +91,7 @@ Options:
2(thorough) for max_depth=2, default 1
--max-page INTEGER Max page number to crawl, default 100000
--max-depth INTEGER Max depth to crawl, default 1
-o, --outfile FILE Output result to specified file
-o, --outfile FILE Output result to specified file in csv format
  -s, --status TEXT    Filter response status to display, separated by
commas, e.g. 200,300-400
-x, --proxy TEXT Set proxy, e.g. http://127.0.0.1:8080,
@@ -122,9 +122,9 @@ to `--max-depth 2`. By default the normal mode `-m 1` is adopted with max depth
secretscraper -u https://scrapeme.live/shop/ -m 2
```

#### Write Results to File
#### Write Results to CSV File
```bash
secretscraper -u https://scrapeme.live/shop/ -o result.log
secretscraper -u https://scrapeme.live/shop/ -o result.csv
```

#### Hide Regex Result
@@ -138,13 +138,23 @@ secretscraper -u https://scrapeme.live/shop/ -H
secretscraper -l <dir or file>
```

#### Switch to hyperscan
I have implemented the regex matching functionality with both the `hyperscan` and `re` modules. The `re` module is used by default; if you pursue higher performance, you can switch to `hyperscan` by changing `handler_type` to `hyperscan` in `settings.yml`.

There are some pitfalls of `hyperscan` that you should take into account before using it:
1. No support for regex groups: you cannot extract content with parentheses.
2. Different regex syntax from `re`.

You'd better write separate regex rules for the two engines; the sketch below illustrates the first pitfall.
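As a rough illustration (not code from SecretScraper, and using a made-up rule), the following shows what capture groups provide under `re` and what `hyperscan` cannot give you:

```python
import re

# Hypothetical secret rule: the parentheses form a capture group, so `re`
# can return just the key value instead of the whole matched string.
pattern = re.compile(r"api_key\s*=\s*'([A-Za-z0-9]{16,})'")

text = "config: api_key = 'ABCDEF0123456789'"
match = pattern.search(text)
if match:
    print(match.group(0))  # whole match: api_key = 'ABCDEF0123456789'
    print(match.group(1))  # captured value only: ABCDEF0123456789

# Hyperscan reports only that and where an expression matched (rule id plus
# start/end offsets); there is no group(1), so rules that rely on capturing
# only make sense with the `re` engine and must be rewritten for hyperscan.
```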

#### Customize Configuration
The built-in config is shown as below. You can assign custom configuration via `-i settings.yml`.
```yaml
verbose: false
debug: false
loglevel: critical
logpath: log
handler_type: re

proxy: "" # http://127.0.0.1:7890
max_depth: 1 # 0 for no limit
@@ -231,6 +241,10 @@ rules:
---

# Change Log
## 2024.5.25 Version 1.4
- Support CSV output
- Use the `re` module as the default regex engine
- Support selecting the regex engine via the `handler_type` configuration
## 2024.4.30 Version 1.3.9
- Add `--validate` option: Validate URLs after the crawler finishes, which helps reduce useless links
- Optimize url collector
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "secretscraper"
version = "1.3.9.4"
version = "1.4"
description = "SecretScraper is a web scraper tool that can scrape the content through target websites and extract secret information via regular expression."
readme = "README.md"
authors = ["Padishah <[email protected]>"]
2 changes: 1 addition & 1 deletion src/secretscraper/__init__.py
@@ -1,3 +1,3 @@
"""SecretScraper"""

__version__ = "1.3.9.4"
__version__ = "1.4"
19 changes: 17 additions & 2 deletions src/secretscraper/cmdline.py
@@ -1,10 +1,11 @@
"""Command line"""

import dataclasses
import functools
import logging
import pathlib

import click
import dynaconf
from click import Context
from dynaconf.base import Settings

@@ -17,6 +18,13 @@
facade_obj = None


@dataclasses.dataclass
class ExternalEntry:
"""Expose objects for external library"""
facade_obj: CrawlerFacade
facade_settings: dynaconf.Dynaconf


# @click.group(invoke_without_command=True)
# @click.pass_context
@click.command()
@@ -78,7 +86,7 @@
@click.option(
"-o",
"--outfile",
help="Output result to specified file",
help="Output result to specified file in csv format",
type=click.Path(
exists=False, file_okay=True, dir_okay=False, path_type=pathlib.Path
),
@@ -105,6 +113,10 @@
type=click.Path(exists=True, file_okay=True, dir_okay=True, path_type=pathlib.Path))
def main(**options):
"""Main commands"""
start(options)


def start(options: dict):
if options["version"]:
click.echo(__version__)
exit(0)
@@ -138,7 +150,9 @@ def main(**options):
else:
facade = CrawlerFacade(settings, options_dict, print_func=print_func)
facade_obj = facade
ExternalEntry.facade_obj = facade
facade_settings = facade.settings
ExternalEntry.facade_settings = facade_settings
except FacadeException as e:
click.echo(f"Error: {e}")
exit(1)
@@ -157,6 +171,7 @@ def generate_configuration(file: pathlib.Path):
debug: false
loglevel: critical
logpath: log
handler_type: re
proxy: "" # http://127.0.0.1:7890
max_depth: 1 # 0 for no limit
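The `ExternalEntry` dataclass and the `start()` wrapper introduced above split the CLI entry point so the crawler can also be driven from another Python program. A rough, untested sketch of that usage, assuming the caller assembles the same option dict that Click would normally pass to `main()`:

```python
from secretscraper.cmdline import ExternalEntry, start


def run_scan(options: dict) -> None:
    """Drive SecretScraper programmatically (illustrative sketch only)."""
    # `options` must contain every key the Click command defines (the CLI
    # flags shown in the README); assembling that dict is left to the caller.
    start(options)
    # After start() returns, the facade and its resolved settings are
    # published as class attributes for the embedding program to inspect.
    settings = ExternalEntry.facade_settings
    facade = ExternalEntry.facade_obj
    print("regex engine:", settings.get("handler_type"))
    print("URL nodes recorded:", len(facade.crawler.url_dict))
```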
9 changes: 7 additions & 2 deletions src/secretscraper/crawler.py
@@ -28,7 +28,7 @@

from .config import settings
from .exception import CrawlerException
from .util import Range
from .util import Range, get_response_title

logger = logging.getLogger(__name__)

@@ -244,7 +244,12 @@ async def process_one(self, url_node: URLNode):
response = await self.fetch(url_node.url)
if response is not None: # and response.status == 200
url_node.response_status = str(response.status_code)

url_node.title = get_response_title(response)
try:
url_node.content_length = int(response.headers.get('content-length'))
except Exception:
pass
url_node.content_type = response.headers.get('content-type')
response_text: str = response.text
# try:
# response_text: str = await response.text(
3 changes: 3 additions & 0 deletions src/secretscraper/entity.py
@@ -23,6 +23,9 @@ class URLNode:
response_status: str = field(default="Unknown", hash=False, compare=False)
depth: int = field(default=0, hash=False, compare=False)
parent: typing.Optional["URLNode"] = field(default=None, hash=False, compare=False)
content_length: int = field(hash=False, compare=False, default=-1)
content_type: str = field(hash=False, compare=False, default="")
title: str = field(hash=False, compare=False, default="")

def __post_init__(self):
if self.parent is not None and self.depth <= self.parent.depth:
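Because the three new fields are declared with `hash=False, compare=False`, the fetched metadata never affects node identity: two `URLNode`s for the same URL still compare equal and collapse into one entry in sets or dict keys. A stripped-down, standalone sketch of that dataclass behaviour (a stand-in, not the real `URLNode`):

```python
from dataclasses import dataclass, field


@dataclass(frozen=True)  # frozen only to make this sketch hashable
class MiniNode:
    url: str
    title: str = field(default="", hash=False, compare=False)
    content_length: int = field(default=-1, hash=False, compare=False)


a = MiniNode("https://example.com", title="Example", content_length=1024)
b = MiniNode("https://example.com")
assert a == b              # metadata is excluded from equality...
assert hash(a) == hash(b)  # ...and from the hash,
assert len({a, b}) == 1    # so set/dict de-duplication is unaffected
```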
116 changes: 61 additions & 55 deletions src/secretscraper/facade.py
@@ -30,11 +30,12 @@

def print_func(f: typing.IO, func: typing.Callable, content: str, **kwargs) -> None:
func(content, **kwargs)
func(content, file=f, **kwargs)
if f is not None:
func(content, file=f, **kwargs)


def print_func_colorful(
f: typing.IO,
f: typing.Optional[typing.IO],
func: typing.Callable,
content: str,
fg: str = None,
@@ -70,7 +71,7 @@ def __init__(
self.custom_settings = custom_settings
self.formatter = Formatter()
self.hide_regex: bool = False
self.outfile = pathlib.Path(__file__).parent / "crawler.log"
self.outfile: typing.Optional[pathlib.Path] = None # pathlib.Path(__file__).parent / "crawler.log"
self.print_func = print_func
self.debug: bool = False
self.follow_redirects: bool = False
@@ -79,57 +80,62 @@ def __init__(

def start(self):
"""Start the crawler and output"""
with self.outfile.open("w") as f:
try:

# print_func(f"Starting crawler...")
print_func_colorful(f,
self.print_func,
f"Target URLs: {', '.join(self.crawler.start_urls)}",
bold=True,
blink=True,
)
self.crawler.start()
self.crawler.start_validate()
if self.detail_output:
# print_func_colorful(self.print_func,f"Total page: {self.crawler.total_page}")
f.write(self.formatter.output_url_hierarchy(self.crawler.url_dict, True))

if not self.hide_regex:
print_func_colorful(f, self.print_func,
f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
)
print_func_colorful(f, self.print_func, f"{self.formatter.output_js(self.crawler.js_dict)}")
f.write(self.formatter.output_found_domains(list(self.crawler.found_urls), True))
else:
# tidy output
# URLs per domain
domains = set()
for url in self.crawler.start_urls:
try:
obj = urlparse(url)
domain, _ = to_host_port(obj.netloc)
if len(domain) > 0:
domains.add(domain.strip())
except:
pass
f.write(self.formatter.output_url_per_domain(domains, self.crawler.url_dict))
# JS per domain
f.write(self.formatter.output_url_per_domain(domains, self.crawler.js_dict, "JS"))
# Domains
f.write(self.formatter.output_found_domains(list(self.crawler.found_urls), True))
# Secrets
if not self.hide_regex:
print_func_colorful(f, self.print_func,
f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
)
except KeyboardInterrupt:
self.print_func("\nExiting...")
self.crawler.close_all()
except Exception as e:
self.print_func(f"Unexpected error: {e}.\nExiting...")
self.crawler.close_all()
# raise FacadeException from e
# with self.outfile.open("w") as f:
f = None
try:

# print_func(f"Starting crawler...")
print_func_colorful(f,
self.print_func,
f"Target URLs: {', '.join(self.crawler.start_urls)}",
bold=True,
blink=True,
)
self.crawler.start()
self.crawler.start_validate()
if self.detail_output:
# print_func_colorful(self.print_func,f"Total page: {self.crawler.total_page}")
self.formatter.output_url_hierarchy(self.crawler.url_dict, True)

if not self.hide_regex:
print_func_colorful(f, self.print_func,
f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
)
print_func_colorful(f, self.print_func, f"{self.formatter.output_js(self.crawler.js_dict)}")
self.formatter.output_found_domains(list(self.crawler.found_urls), True)
else:
# tidy output
# URLs per domain
domains = set()
for url in self.crawler.start_urls:
try:
obj = urlparse(url)
domain, _ = to_host_port(obj.netloc)
if len(domain) > 0:
domains.add(domain.strip())
except:
pass
self.formatter.output_url_per_domain(domains, self.crawler.url_dict)
# JS per domain
self.formatter.output_url_per_domain(domains, self.crawler.js_dict, "JS")
# Domains
self.formatter.output_found_domains(list(self.crawler.found_urls), True)
# Secrets
if not self.hide_regex:
print_func_colorful(f, self.print_func,
f"{self.formatter.output_secrets(self.crawler.url_secrets)}"
)
if self.outfile is not None:
self.formatter.output_csv(self.outfile, self.crawler.url_dict, self.crawler.url_secrets)
print_func_colorful(None, self.print_func, f"Save result to csv file {self.outfile.name}", fg="green",
bold=True)
except KeyboardInterrupt:
self.print_func("\nExiting...")
self.crawler.close_all()
except Exception as e:
self.print_func(f"Unexpected error: {e}.\nExiting...")
self.crawler.close_all()
# raise FacadeException from e

def create_crawler(self) -> Crawler:
"""Create a Crawler"""
@@ -260,7 +266,7 @@ def create_crawler(self) -> Crawler:

# Read rules from config file
rules: typing.Dict[str, str] = read_rules_from_setting(self.settings)
handler_type = self.settings.get("handler_type", "regex")
handler_type = self.settings.get("handler_type", "re")
if handler_type == "hyperscan":
handler = get_regex_handler(rules)
print_config(f"Using regex handler: Hyperscan")