Merge pull request ReFirmLabs#800 from ReFirmLabs/xz_false_positive

Improved XZ signature detection; added support for malformed XZ streams
m-1-k-3 · Dec 14, 2024 · 9d7db82 · 9d7db82
2 parents ff02dc7 + 23b3aa5
commit 9d7db82
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 83 deletions.
diff --git a/src/signatures/xz.rs b/src/signatures/xz.rs
@@ -1,7 +1,8 @@
 use crate::common::is_offset_safe;
+use crate::extractors::lzma::lzma_decompress;
+use crate::extractors::sevenzip::sevenzip_extractor;
 use crate::signatures::common::{SignatureError, SignatureResult, CONFIDENCE_HIGH};
-use crate::structures::xz::{parse_xz_footer, parse_xz_header};
-use aho_corasick::AhoCorasick;
+use crate::structures::xz::parse_xz_header;
 
 /// Human readable description
 pub const DESCRIPTION: &str = "XZ compressed data";
@@ -23,69 +24,48 @@ pub fn xz_parser(file_data: &[u8], offset: usize) -> Result<SignatureResult, Sig
 
     let mut next_offset = offset;
     let mut previous_offset = None;
+    let mut stream_header_count = 0;
     let available_data = file_data.len() - offset;
 
     // XZ streams can be concatenated together, need to process them all to determine the size of an XZ file
     while is_offset_safe(available_data, next_offset, previous_offset) {
-        // Parse the next XZ header to get the header's size
+        // Parse the next XZ header to validate the header CRC
         match parse_xz_header(&file_data[next_offset..]) {
             Err(_) => break,
-            Ok(header_size) => {
-                match file_data.get(next_offset + header_size..) {
-                    None => break,
-                    Some(xz_stream_data) => {
-                        // Determine the size of the XZ stream data
-                        match xz_stream_size(xz_stream_data) {
-                            Err(_) => break,
-                            Ok(stream_size) => {
-                                previous_offset = Some(next_offset);
-                                next_offset += header_size + stream_size;
-                            }
-                        }
-                    }
+            Ok(_) => {
+                // Header is valid
+                stream_header_count += 1;
+
+                // Do an extraction dry-run to make sure the data decompresses correctly
+                let dry_run = lzma_decompress(file_data, next_offset, None);
+
+                // If dry run was a success, update the offset and size fields
+                if dry_run.success && dry_run.size.is_some() {
+                    previous_offset = Some(next_offset);
+                    next_offset += dry_run.size.unwrap();
+                    result.size += dry_run.size.unwrap();
+                // Else, report that the data is malformed and stop processing XZ streams
+                } else {
+                    // 7z may be able to at least partially extract malformed data streams
+                    result.preferred_extractor = Some(sevenzip_extractor());
+                    result.description = format!(
+                        "{}, valid header with malformed data stream",
+                        result.description
+                    );
+                    break;
                 }
             }
         }
     }
 
-    // If at least one valid header and one valid stream were identified,
-    // next_offset will be greater than the starting offset.
-    if next_offset > offset {
-        result.size = next_offset - offset;
-        result.description = format!("{}, total size: {} bytes", result.description, result.size);
-        return Ok(result);
-    }
-
-    Err(SignatureError)
-}
-
-/// XZ file format has detectable, verifiable, end-of-stream markers.
-fn xz_stream_size(xz_data: &[u8]) -> Result<usize, SignatureError> {
-    // The magic bytes we search for ("YZ") are actually 10 bytes into the footer header
-    const FOOTER_MAGIC_OFFSET: usize = 10;
-
-    /*
-     * Gotta grep for the end-of-stream magic bytes ("YZ").
-     * These are prone to false positives, but a valid footer includes a checksum,
-     * making false positive matches easy to filter out (see: parse_xz_footer).
-     */
-    let eof_pattern = vec![b"YZ"];
-    let grep = AhoCorasick::new(eof_pattern).unwrap();
-
-    // Find all matching patterns in the xz compressed data
-    for eof_match in grep.find_overlapping_iter(xz_data) {
-        let match_offset: usize = eof_match.start();
-        let footer_start: usize = match_offset - FOOTER_MAGIC_OFFSET;
-
-        // Footer must be 4-byte aligned
-        if (footer_start % 4) == 0 {
-            if let Some(footer_data) = xz_data.get(footer_start..) {
-                // Parse the stream footer
-                if let Ok(footer_size) = parse_xz_footer(footer_data) {
-                    return Ok(footer_start + footer_size);
-                }
-            }
+    // Return success if at least one valid XZ stream header was found
+    if stream_header_count > 0 {
+        // Only report the total size if we were able to determine the total size
+        if result.size > 0 {
+            result.description =
+                format!("{}, total size: {} bytes", result.description, result.size);
         }
+        return Ok(result);
     }
 
     Err(SignatureError)

diff --git a/src/structures/xz.rs b/src/structures/xz.rs
@@ -24,33 +24,3 @@ pub fn parse_xz_header(xz_data: &[u8]) -> Result<usize, StructureError> {
 
     Err(StructureError)
 }
-
-/// Parse and validate an XZ footer, returns the footer size
-pub fn parse_xz_footer(xz_data: &[u8]) -> Result<usize, StructureError> {
-    const FOOTER_SIZE: usize = 12;
-    const CRC_DATA_SIZE: usize = 6;
-    const CRC_START_INDEX: usize = 4;
-
-    let xz_footer_structure = vec![
-        ("footer_crc", "u32"),
-        ("backward_size", "u32"),
-        ("flags", "u16"),
-        ("magic", "u16"),
-    ];
-
-    // Parse the stream footer
-    if let Ok(xz_footer) = common::parse(xz_data, &xz_footer_structure, "little") {
-        // Calculate the start and end offsets of the CRC'd data
-        let crc_start = CRC_START_INDEX;
-        let crc_end = crc_start + CRC_DATA_SIZE;
-
-        // Validate the stream footer
-        if let Some(crc_data) = xz_data.get(crc_start..crc_end) {
-            if crc32(crc_data) == (xz_footer["footer_crc"] as u32) {
-                return Ok(FOOTER_SIZE);
-            }
-        }
-    }
-
-    Err(StructureError)
-}