Extract mhtml content into Python dictionary #7

Open · wants to merge 1 commit into master
2 changes: 1 addition & 1 deletion .gitignore
@@ -11,4 +11,4 @@ benchmark.py
*.mhtml

# Ignore 'NOTES' file in a case-insensitive manner
[Nn][Oo][Tt][Ee][Ss]
85 changes: 62 additions & 23 deletions MHTMLExtractor.py
@@ -32,25 +32,40 @@ class MHTMLExtractor:
url_mapping (dict): A dictionary mapping original URLs to new filenames.
"""

def __init__(self, mhtml_path=None, output_dir='./extracted_mhtml', buffer_size=8192, clear_output_dir=False, create_in_memory_output=False, create_output_files=True):
"""
Initialize the MHTMLExtractor class.

Args:
mhtml_path (str): Path to the MHTML document.
output_dir (str, optional): Output directory for the extracted files. Defaults to `./extracted_mhtml`, relative to the `MHTMLExtractor.py` file.
buffer_size (int, optional): Buffer size for reading the MHTML file. Defaults to 8192.
clear_output_dir (bool, optional): If True, clears the output directory before extraction. Defaults to False.
create_in_memory_output (bool, optional): If True, stores a dict representation of each extracted file in `self.extracted_contents`. Defaults to False.
create_output_files (bool, optional): If True, writes the extracted files into `output_dir`. Defaults to True.
"""
self.mhtml_path = mhtml_path
self.output_dir = output_dir
self.buffer_size = buffer_size
self.create_in_memory_output = create_in_memory_output
self.create_output_files = create_output_files
self.boundary = None
self.extracted_count = 0
self.url_mapping = {} # Mapping between Content-Location and new filenames
self.saved_html_files = [] # List to keep track of saved HTML filenames

self.ensure_directory_exists(self.output_dir, clear_output_dir)
# Example content:
# {
# 'example.com_c5e95188a491577c4f22329fd339b744.html': {
# 'content_type': 'text/html',
# 'decoded_body': b'<!DOCTYPE html><html>...</html>'
# },
# ...
# }
self.extracted_contents = {}

if self.create_output_files:
self.ensure_directory_exists(self.output_dir, clear_output_dir)

def ensure_directory_exists(self, directory_path, clear=False):
try:
@@ -219,8 +234,14 @@ def _process_part(self, part, no_css=False, no_images=False, html_only=False):
cid = "cid:" + content_id_match.group(1)
self.url_mapping[cid] = filename

if self.create_in_memory_output:
self.extracted_contents[filename] = {
'content_type': content_type,
'decoded_body': decoded_body
}

if self.create_output_files:
self._write_to_file(filename, content_type, decoded_body)
except Exception as e:
logging.error(f"Error processing MHTML part: {e}")

@@ -260,28 +281,35 @@ def _update_html_links(self, filepath, sorted_urls, hash_pattern, no_css=False,
if html_only:
return

if self.create_output_files:
    self._update_html_links_file_handler(content, filepath, sorted_urls, hash_pattern, no_css=no_css, no_images=no_images)
else:
    # In-memory mode: decode the stored bytes so the body is text; links are not rewritten here.
    content = self.extracted_contents[filepath]['decoded_body'].decode()
    self.extracted_contents[filepath]['decoded_body'] = content

def _update_html_links_file_handler(self, content, filepath, sorted_urls, hash_pattern, no_css=False, no_images=False):
with open(filepath, "r", encoding="utf-8") as html_file:
content = html_file.read()

# For each original URL, replace it with the new filename in the content
for original_url in sorted_urls:
new_filename = self.url_mapping[original_url]

# Skip updating links for CSS files if no_css flag is set
if no_css and new_filename.endswith(".css"):
continue
# For each original URL, replace it with the new filename in the content
for original_url in sorted_urls:
new_filename = self.url_mapping[original_url]

# Skip updating links for image files if no_images flag is set
if no_images and any(new_filename.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"]):
continue
# Skip updating links for CSS files if no_css flag is set
if no_css and new_filename.endswith(".css"):
continue

matches = list(re.finditer(re.escape(original_url), content))
# Skip updating links for image files if no_images flag is set
if no_images and any(new_filename.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".svg"]):
continue

# Replace the links in the content
for match in reversed(matches):
if not hash_pattern.match(content, match.end()):
content = content[: match.start()] + new_filename + content[match.end() :]
matches = list(re.finditer(re.escape(original_url), content))

# Replace the links in the content
for match in reversed(matches):
if not hash_pattern.match(content, match.end()):
content = content[: match.start()] + new_filename + content[match.end() :]

with open(filepath, "w", encoding="utf-8") as html_file:
html_file.write(content)

@@ -291,12 +319,16 @@ def extract(self, no_css=False, no_images=False, html_only=False):
"""
temp_buffer_chunks = [] # Use a list to store chunks and join them when needed

is_stopped_with_break = False

try:
with open(self.mhtml_path, "r", encoding="utf-8") as file:
# Continuously read from the MHTML file until no more content is left
while True:
chunk = file.read(self.buffer_size)
if not chunk:
# End of file: note that the read loop ended normally via `break`,
# so the except block below doesn't log exceptions raised after a successful read.
is_stopped_with_break = True
break

"""
@@ -339,10 +371,17 @@ def extract(self, no_css=False, no_images=False, html_only=False):
for filename in self.saved_html_files:
filepath = os.path.join(self.output_dir, filename)
self._update_html_links(filepath, sorted_urls, hash_pattern)

logging.info(f"Extracted {self.extracted_count-1} files into {self.output_dir}")
except Exception as e:
logging.error(f"Error during extraction: {e}")
if not is_stopped_with_break:
    logging.error(f"Error during extraction: {e}")

if self.create_output_files:
near = ' (relative to the `MHTMLExtractor.py` file)' if str(self.output_dir).startswith('./') else ''

logging.info(f"Extracted {self.extracted_count-1} files into {self.output_dir}{near}.")

if self.create_in_memory_output:
logging.info(f"Extracted {self.extracted_count-1} files content into `extracted_contents` property.")


if __name__ == "__main__":
64 changes: 62 additions & 2 deletions README.md
@@ -5,6 +5,7 @@
## Features

- Extracts embedded files (e.g., CSS, images, JavaScript) from MHTML documents.
- Saves extracted content to physical files or to an in-memory Python dictionary.
- Provides options to selectively skip extraction of certain file types.
- Handles potential filename conflicts by appending a counter.
- Efficient reading of large MHTML files through buffering.
@@ -15,7 +16,7 @@
- Python 3.x
- No external libraries are required.

## Usage (CLI)

To use the MHTML Extractor, simply run the script and provide the necessary arguments:
```bash
@@ -44,7 +45,23 @@ optional arguments:

```

## Usage (Python)

To use the MHTML Extractor from Python, import the `MHTMLExtractor` class and provide the necessary arguments:
```py
from MHTMLExtractor import MHTMLExtractor

extractor = MHTMLExtractor(
    mhtml_path='example.mhtml',
    output_dir='path/to/output/dir',  # Optional; defaults to `./extracted_mhtml`, relative to the `MHTMLExtractor.py` file.
    create_in_memory_output=True,     # Optional; defaults to False. If True, extracted data is stored in `extractor.extracted_contents`.
    create_output_files=False         # Optional; defaults to True. If False, no output files are written.
)

# ...
```
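The `extract()` method accepts the same selective-skip flags as the CLI (its signature in this diff is `extract(self, no_css=False, no_images=False, html_only=False)`), so a run that skips stylesheets and images looks like this:

```py
# Skip CSS and image parts during extraction.
extractor.extract(no_css=True, no_images=True)
```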

## Examples (CLI)

1. Extract all files from an MHTML document:
```bash
@@ -61,6 +78,49 @@ python mhtml_extractor.py example.mhtml --output_dir=./output
python mhtml_extractor.py example.mhtml --html_only
```

## Examples (Python)

1. In-memory mode (files won't be created):
```py
from MHTMLExtractor import MHTMLExtractor

extractor = MHTMLExtractor(
    mhtml_path='example.mhtml',
    create_in_memory_output=True,
    create_output_files=False
)
extractor.extract()

# Extracted content available in `extractor.extracted_contents` dict.
for filename, details in extractor.extracted_contents.items():
    print('=== Filename:', filename, '\n')
    print('=== Content type:', details['content_type'], '\n')
    print('=== Decoded content:', details['decoded_body'])

    break  # Inspect only the first extracted part
```
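Building on the extractor above, here is a minimal sketch of what in-memory mode enables: persisting selected parts yourself instead of letting the extractor write files. The `my_output` directory name is illustrative, and since the in-memory path stores post-processed HTML bodies as `str` while other parts remain `bytes`, both types are handled:

```py
import pathlib

out = pathlib.Path('my_output')
out.mkdir(exist_ok=True)

# Persist only the HTML parts, under the filenames the extractor chose.
for filename, details in extractor.extracted_contents.items():
    if (details['content_type'] or '').startswith('text/html'):
        body = details['decoded_body']
        # Normalize to bytes before writing to disk.
        data = body if isinstance(body, bytes) else body.encode('utf-8')
        (out / filename).write_bytes(data)
```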

2. Both in-memory mode and file mode:
```py
from MHTMLExtractor import MHTMLExtractor

extractor = MHTMLExtractor(
    mhtml_path='example.mhtml',
    output_dir='/path/to/output/dir',  # Optional; defaults to `./extracted_mhtml`, relative to the `MHTMLExtractor.py` file.
    create_in_memory_output=True,
    create_output_files=True,
)
extractor.extract()

# Extracted content available in `extractor.extracted_contents` dict.
for filename, details in extractor.extracted_contents.items():
    print('=== Filename:', filename, '\n')
    print('=== Content type:', details['content_type'], '\n')
    print('=== Decoded content:', details['decoded_body'])

    break  # Inspect only the first extracted part
```

## Notes

- **Purpose**: This script is designed to extract files (like images, CSS, and HTML content) from MHTML documents. MHTML is a web page archive format that's used to combine multiple resources from a web page into a single file.
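  For orientation, an MHTML document is a MIME `multipart/related` message: a top-level header block declares a boundary string, and each embedded resource follows as a part with its own headers (`Content-Type`, `Content-Location`, optionally `Content-Transfer-Encoding`). A hand-written, purely illustrative skeleton:
```
From: <Saved by an example browser>
MIME-Version: 1.0
Content-Type: multipart/related; boundary="----MultipartBoundary--example"

------MultipartBoundary--example
Content-Type: text/html
Content-Location: https://example.com/

<!DOCTYPE html><html><body><img src="https://example.com/logo.png"></body></html>

------MultipartBoundary--example
Content-Type: image/png
Content-Transfer-Encoding: base64
Content-Location: https://example.com/logo.png

iVBORw0KGgo...

------MultipartBoundary--example--
```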