diff --git a/Cargo.toml b/Cargo.toml
index f6cc545..fb6d7c1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "bin2ml"
-version = "0.2.6"
+version = "0.2.7"
 edition = "2021"
 
 [dependencies]
@@ -27,6 +27,7 @@ env_logger = "0.10.0"
 thiserror = "1.0.47"
 enum-as-inner = "0.6.0"
 ordered-float = { version = "4.2.0", features = ["serde"] }
+
 [dependencies.petgraph]
 version = "0.6.2"
 features = ["serde-1"]
diff --git a/src/afij.rs b/src/afij.rs
index 18c6e43..eb43764 100644
--- a/src/afij.rs
+++ b/src/afij.rs
@@ -1,3 +1,4 @@
+use ordered_float::OrderedFloat;
 use serde::{Deserialize, Serialize};
 use serde_aux::prelude::*;
 use serde_json::Value;
@@ -5,26 +6,26 @@ use serde_json::Value;
 #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct AFIJFunctionInfo {
-    pub offset: i64,
+    pub offset: u64,
     pub name: String,
     pub size: i128,
     #[serde(rename = "is-pure")]
     pub is_pure: String,
-    pub realsz: i64,
+    pub realsz: u64,
     pub noreturn: bool,
-    pub stackframe: i64,
+    pub stackframe: u64,
     pub calltype: String,
-    pub cost: i64,
-    pub cc: i64,
-    pub bits: i64,
+    pub cost: u64,
+    pub cc: u64,
+    pub bits: u64,
     #[serde(rename = "type")]
     pub type_field: String,
-    pub nbbs: i64,
+    pub nbbs: u64,
     #[serde(rename = "is-lineal")]
     pub is_lineal: bool,
     pub ninstrs: i64,
     pub edges: i64,
-    pub ebbs: i64,
+    pub ebbs: u64,
     pub signature: String,
     pub minbound: u64,
     pub maxbound: i128,
@@ -125,3 +126,34 @@ impl From<&AFIJFunctionInfo> for AFIJFeatureSubset {
         }
     }
 }
+
+#[derive(Default, Debug, Clone, PartialEq, Hash, Serialize, Deserialize)]
+pub struct AFIJFeatureSubsetExtended {
+    pub name: String,
+    pub ninstrs: i64,
+    pub edges: i64,
+    pub indegree: i64,
+    pub outdegree: i64,
+    pub nlocals: i64,
+    pub nargs: i64,
+    pub nbbs: u64,
+    pub avg_ins_bb: OrderedFloat<f32>,
+}
+
+impl From<&AFIJFunctionInfo> for AFIJFeatureSubsetExtended {
+    fn from(src: &AFIJFunctionInfo) -> AFIJFeatureSubsetExtended {
+        let avg_ins_bbs = OrderedFloat::from(src.ninstrs as f32 / src.nbbs as f32);
+
+        AFIJFeatureSubsetExtended {
+            name: src.name.clone(),
+            ninstrs: src.ninstrs,
+            edges: src.edges,
+            indegree: src.indegree.unwrap_or(0),
+            outdegree: src.outdegree.unwrap_or(0),
+            nlocals: src.nlocals.unwrap_or(0),
+            nargs: src.nargs.unwrap_or(0),
+            nbbs: src.nbbs,
+            avg_ins_bb: avg_ins_bbs,
+        }
+    }
+}
diff --git a/src/agcj.rs b/src/agcj.rs
index 0c24c8e..d1d983e 100644
--- a/src/agcj.rs
+++ b/src/agcj.rs
@@ -7,10 +7,11 @@ use itertools::Itertools;
 use petgraph::prelude::Graph;
 use serde::{Deserialize, Serialize};
 use std::fs::File;
+use std::path::PathBuf;
 
 #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
-pub struct AGCJFunctionCallGraphs {
+pub struct AGCJFunctionCallGraph {
     pub name: String,
     pub size: i64,
     pub imports: Option<Vec<String>>,
@@ -24,16 +25,20 @@ pub struct AGCJParsedObjects {
     pub nodes: Vec<String>,
 }
 
-impl AGCJFunctionCallGraphs {
+impl AGCJFunctionCallGraph {
     fn graph_to_json_func_node(
         &self,
-        binary_name: &str,
-        output_path: &String,
+        binary_name: &PathBuf,
+        output_path: &PathBuf,
         networkx_graph: NetworkxDiGraph<CallGraphFuncNameNode>,
         type_suffix: &str,
     ) {
-        let full_output_path =
-            get_save_file_path(binary_name, output_path, Some(type_suffix.to_string()));
+        let mut full_output_path = get_save_file_path(
+            binary_name,
+            output_path,
+            Some(type_suffix.to_string()),
+            None,
+        );
         check_or_create_dir(&full_output_path);
 
         let mut function_name = self.name.clone();
 
         // This is a pretty dirty fix and may break things
         if function_name.chars().count() > 100 {
             function_name = self.name[..75].to_string();
         }
 
-        let filename = format!(
-            "{}/{}-{}.json",
-            full_output_path, function_name, type_suffix
-        );
+        let filename = format!("{}-{}.json", function_name, type_suffix);
+
+        // Normalise string for windows
+        let filename = filename.replace(&['(', ')', ',', '\"', ';', ':', '\''][..], "");
+        full_output_path.push(filename);
+
+        debug!("Filename to save graphs to: {:?}", full_output_path);
 
         serde_json::to_writer(
-            &File::create(filename).expect("Failed to create writer"),
+            &File::create(full_output_path).expect("Failed to create writer"),
             &networkx_graph,
         )
         .expect("Unable to write JSON");
@@ -57,13 +65,17 @@ impl AGCJFunctionCallGraphs {
 
     fn graph_to_json_func_metadata_tiknib(
         &self,
-        binary_name: &str,
-        output_path: &String,
+        binary_name: &PathBuf,
+        output_path: &PathBuf,
         networkx_graph: NetworkxDiGraph<CallGraphTikNibFeatures>,
         type_suffix: &str,
     ) {
-        let full_output_path =
-            get_save_file_path(binary_name, output_path, Some(type_suffix.to_string()));
+        let full_output_path = get_save_file_path(
+            binary_name,
+            output_path,
+            Some(type_suffix.to_string()),
+            None,
+        );
         check_or_create_dir(&full_output_path);
 
         let mut function_name = self.name.clone();
@@ -74,7 +86,7 @@ impl AGCJFunctionCallGraphs {
         }
 
         let filename = format!(
-            "{}/{}-{}.json",
+            "{:?}/{}-{}.json",
             full_output_path, function_name, type_suffix
         );
 
@@ -87,15 +99,19 @@ impl AGCJFunctionCallGraphs {
 
     fn graph_to_json_func_metadata_finfo(
         &self,
-        binary_name: &str,
-        output_path: &String,
+        binary_name: &PathBuf,
+        output_path: &PathBuf,
         networkx_graph: NetworkxDiGraph<CallGraphFuncWithMetadata>,
         type_suffix: &str,
     ) {
-        let full_output_path =
-            get_save_file_path(binary_name, output_path, Some(type_suffix.to_string()));
+        let mut full_output_path = get_save_file_path(
+            binary_name,
+            output_path,
+            Some(type_suffix.to_string()),
+            None,
+        );
         check_or_create_dir(&full_output_path);
-
+        debug!("Built Path: {:?}", full_output_path);
         let mut function_name = self.name.clone();
 
         // This is a pretty dirty fix and may break things
@@ -103,13 +119,14 @@ impl AGCJFunctionCallGraphs {
             function_name = self.name[..75].to_string();
         }
 
-        let filename = format!(
-            "{}/{}-{}.json",
-            full_output_path, function_name, type_suffix
-        );
+        let filename = format!("{}-{}.json", function_name, type_suffix);
+        // Normalise string for windows
+        let filename = filename.replace(&['(', ')', ',', '\"', ';', ':', '\''][..], "");
+        full_output_path.push(filename);
+        debug!("Attempting to save to {:?}", full_output_path);
 
         serde_json::to_writer(
-            &File::create(filename).expect("Failed to create writer"),
+            &File::create(full_output_path).expect("Failed to create writer"),
             &networkx_graph,
         )
         .expect("Unable to write JSON");
@@ -148,7 +165,7 @@ impl AGCJFunctionCallGraphs {
         trace!("Imports: {:?}", self.imports);
         for import in self.imports.as_ref().unwrap().iter() {
             trace! {"Starting to Process {:?}", import};
-            let import_object: &Vec<&AGCJFunctionCallGraphs> = &global_cg
+            let import_object: &Vec<&AGCJFunctionCallGraph> = &global_cg
                 .function_call_graphs
                 .as_ref()
                 .unwrap()
@@ -226,8 +243,8 @@ impl AGCJFunctionCallGraphs {
     pub fn to_petgraph(
         &self,
         global_cg: &AGCJFile,
-        output_path: &String,
-        binary_name: &str,
+        output_path: &PathBuf,
+        binary_name: &PathBuf,
         with_metadata: &bool,
         include_unk: &bool,
         node_feature_type: Option<String>,
@@ -250,8 +267,8 @@ impl AGCJFunctionCallGraphs {
     pub fn one_hop_to_petgraph(
         &self,
         global_cg: &AGCJFile,
-        output_path: &String,
-        binary_name: &str,
+        output_path: &PathBuf,
+        binary_name: &PathBuf,
         with_metadata: &bool,
         include_unk: &bool,
         node_feature_type: Option<String>,
@@ -273,8 +290,8 @@ impl AGCJFunctionCallGraphs {
     pub fn to_petgraph_with_callers(
         &self,
         global_cg: &AGCJFile,
-        output_path: &String,
-        binary_name: &str,
+        output_path: &PathBuf,
+        binary_name: &PathBuf,
         with_metadata: &bool,
         include_unk: &bool,
         node_feature_type: Option<String>,
@@ -296,8 +313,8 @@ impl AGCJFunctionCallGraphs {
     pub fn one_hop_to_petgraph_with_callers(
         &self,
         global_cg: &AGCJFile,
-        output_path: &String,
-        binary_name: &str,
+        output_path: &PathBuf,
+        binary_name: &PathBuf,
         with_metadata: &bool,
         include_unk: &bool,
         node_feature_type: Option<String>,
@@ -327,8 +344,8 @@ impl AGCJFunctionCallGraphs {
         &self,
         graph: Graph<String, u32>,
         global_cg: &AGCJFile,
-        binary_name: &str,
-        output_path: &String,
+        binary_name: &PathBuf,
+        output_path: &PathBuf,
         with_metadata: &bool,
         node_feature_type: Option<String>,
         type_suffix: &str,
@@ -384,13 +401,13 @@ impl AGCJFunctionCallGraphs {
 #[cfg(test)]
 mod tests {
     use crate::files::AGCJFile;
-    use env_logger;
+    use std::path::PathBuf;
 
     fn return_test_file_oject() -> AGCJFile {
         let mut call_graph_file = AGCJFile {
-            filename: "test-files/ls_cg.json".to_string(),
+            filename: PathBuf::from("test-files/ls_cg.json"),
             function_call_graphs: None,
-            output_path: "".to_string(),
+            output_path: PathBuf::new(),
             function_metadata: None,
             include_unk: false,
         };
@@ -402,7 +419,7 @@ mod tests {
     }
     #[test]
     fn test_function_call_graph_without_unks() {
-        let mut call_graph_file = return_test_file_oject();
+        let call_graph_file = return_test_file_oject();
 
         // Get main function - No Unks
         let raw_call_graph_data = &call_graph_file.function_call_graphs.clone().unwrap()[0];
@@ -417,7 +434,7 @@ mod tests {
 
     #[test]
     fn test_function_call_graph_with_callees_without_unks() {
-        let mut call_graph_file = return_test_file_oject();
+        let call_graph_file = return_test_file_oject();
 
         // Unk False
         let raw_call_graph_data = &call_graph_file.function_call_graphs.clone().unwrap()[0];
@@ -452,7 +469,7 @@ mod tests {
 
     #[test]
     fn test_function_call_graph_with_callees_with_unks() {
-        let mut call_graph_file = return_test_file_oject();
+        let call_graph_file = return_test_file_oject();
 
         // sym.func.100004d11 - One unknown
         let raw_call_graph_data = &call_graph_file.function_call_graphs.clone().unwrap()[2];
@@ -471,7 +488,7 @@ mod tests {
 
     #[test]
     fn test_function_call_graph_callees_and_callers_with_unks() {
-        let mut call_graph_file = return_test_file_oject();
+        let call_graph_file = return_test_file_oject();
 
         // sym.func.100004d11 - One unknown
         let raw_call_graph_data = &call_graph_file.function_call_graphs.clone().unwrap()[2];
diff --git a/src/agfj.rs b/src/agfj.rs
index 83c52aa..dbb5783 100644
--- a/src/agfj.rs
+++ b/src/agfj.rs
@@ -9,8 +9,10 @@ use petgraph::prelude::Graph;
 use petgraph::visit::Dfs;
 use serde::{Deserialize, Serialize};
 use serde_json;
+#[cfg(feature = "inference")]
+use serde_json::{Map, Value};
 use std::fs::File;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 #[cfg(feature = "inference")]
 use std::process::exit;
 #[cfg(feature = "inference")]
@@ -239,14 +241,14 @@ impl AGFJFunc {
     #[cfg(feature = "inference")]
     pub fn generate_embedded_cfg(
         &self,
-        path: &str,
+        path: &PathBuf,
         min_blocks: &u16,
-        output_path: &String,
+        output_path: &PathBuf,
         feature_type: FeatureType,
         inference_job: &Option<Arc<InferenceJob>>,
     ) {
         info!("Processing {:?}", self.name);
-        let full_output_path = get_save_file_path(path, output_path, None);
+        let full_output_path = get_save_file_path(path, output_path, None, None);
         check_or_create_dir(&full_output_path);
 
         // offset != 1 has been added to skip functions with invalid instructions
@@ -303,11 +305,11 @@ impl AGFJFunc {
                 )
             };
 
-            let file_name = path.split('/').last().unwrap();
+            let file_name = path.file_name().unwrap();
             let binary_name: Vec<_> = file_name.split(".j").collect();
 
             let fname_string = format!(
-                "{}/{}-{}.json",
+                "{:?}/{:?}-{}.json",
                 &full_output_path, binary_name[0], self.name
             );
             serde_json::to_writer(
@@ -323,25 +325,27 @@ impl AGFJFunc {
 
     pub fn generate_attributed_cfg(
         &self,
-        path: &str,
+        path: &PathBuf,
         min_blocks: &u16,
-        output_path: &String,
+        output_path: &PathBuf,
         feature_type: FeatureType,
         architecture: &String,
     ) {
-        let full_output_path = get_save_file_path(path, output_path, None);
+        let full_output_path = get_save_file_path(path, output_path, None, None);
         check_or_create_dir(&full_output_path);
-        let file_name = path.split('/').last().unwrap();
-        let binary_name: Vec<_> = file_name.split(".j").collect();
-        let mut function_name = self.name.clone();
+        let file_name = path.file_name().unwrap();
+        let binding = file_name.to_string_lossy().to_string();
 
-        // This is a pretty dirty fix and may break things
-        if function_name.chars().count() > 100 {
-            function_name = self.name[..75].to_string();
-        }
+        let binary_name: Vec<_> = binding.split(".j").collect();
+
+        let function_name = if self.name.chars().count() > 100 {
+            &self.name[..75]
+        } else {
+            &self.name
+        };
 
         let fname_string = format!(
-            "{}/{}-{}.json",
+            "{:?}/{:?}-{}.json",
             &full_output_path, binary_name[0], function_name
         );
@@ -541,6 +545,7 @@ impl From<(&String, Vec)> for TikNibFunc {
 #[cfg(test)]
 mod tests {
     use crate::bb::FeatureType;
+    use std::path::PathBuf;
 
     use crate::AGFJFile;
@@ -551,11 +556,11 @@ mod tests {
 
     #[test]
     fn file_struct_creation() {
-        let file_path = "../sample-tool-outputs/r2/example_agfj@@F_output.json".to_string();
+        let file_path = PathBuf::from("../sample-tool-outputs/r2/example_agfj@@F_output.json");
         let file = AGFJFile {
             functions: None,
             filename: file_path.to_owned(),
-            output_path: "output.json".to_string(),
+            output_path: PathBuf::from("output.json"),
             min_blocks: 5,
             feature_type: Some(crate::bb::FeatureType::Gemini),
             architecture: None,
@@ -565,20 +570,20 @@ mod tests {
         assert!(file.functions.is_none());
         assert_eq!(
             file.filename,
-            "../sample-tool-outputs/r2/example_agfj@@F_output.json".to_string()
+            PathBuf::from("../sample-tool-outputs/r2/example_agfj@@F_output.json")
         );
-        assert_eq!(file.output_path, "output.json".to_string());
+        assert_eq!(file.output_path, PathBuf::from("output.json"));
         assert_eq!(file.min_blocks, 5);
         assert_eq!(file.feature_type, Some(FeatureType::Gemini));
     }
 
     #[test]
     fn test_file_load_and_desearlize() {
-        let file_path = "test-files/r2-output-samples/example_agfj@@F_output.json".to_string();
+        let file_path = PathBuf::from("test-files/r2-output-samples/example_agfj@@F_output.json");
         let mut file = AGFJFile {
             functions: None,
             filename: file_path.to_owned(),
-            output_path: "output.json".to_string(),
+            output_path: PathBuf::from("output.json"),
             min_blocks: 5,
             feature_type: Some(crate::bb::FeatureType::Gemini),
             architecture: None,
@@ -653,11 +658,11 @@ mod tests {
 
     #[test]
     fn test_func_edge_list_generation() {
-        let file_path = "test-files/r2-output-samples/test_bin_agfj.json".to_string();
+        let file_path = PathBuf::from("test-files/r2-output-samples/test_bin_agfj.json");
         let mut file = AGFJFile {
             functions: None,
             filename: file_path.to_owned(),
-            output_path: "output.json".to_string(),
+            output_path: PathBuf::from("output.json"),
             min_blocks: 5,
             feature_type: Some(crate::bb::FeatureType::Gemini),
             architecture: None,
diff --git a/src/binnfo.rs b/src/binnfo.rs
index ce953c6..e020581 100644
--- a/src/binnfo.rs
+++ b/src/binnfo.rs
@@ -1,10 +1,9 @@
 use goblin::{error, Object};
 use std::fs;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 
-pub fn goblin_info(fpath: &str) -> error::Result<()> {
-    let path = Path::new(fpath);
-    let buffer = fs::read(path)?;
+pub fn goblin_info(fpath: &PathBuf) -> error::Result<()> {
+    let buffer = fs::read(fpath)?;
     match Object::parse(&buffer)? {
         Object::Elf(elf) => {
             println!("elf: {:#?}", &elf);
diff --git a/src/combos.rs b/src/combos.rs
new file mode 100644
index 0000000..e61be24
--- /dev/null
+++ b/src/combos.rs
@@ -0,0 +1,54 @@
+use crate::afij::AFIJFeatureSubset;
+use crate::agfj::TikNibFuncFeatures;
+
+use ordered_float::OrderedFloat;
+pub struct FinfoTiknib {
+    pub name: String,
+    pub edges: i64,
+    pub indegree: i64,
+    pub outdegree: i64,
+    pub nlocals: i64,
+    pub nargs: i64,
+    pub avg_arithshift: OrderedFloat<f32>,
+    pub avg_compare: OrderedFloat<f32>,
+    pub avg_ctransfer: OrderedFloat<f32>,
+    pub avg_ctransfercond: OrderedFloat<f32>,
+    pub avg_dtransfer: OrderedFloat<f32>,
+    pub avg_float: OrderedFloat<f32>,
+    pub avg_total: OrderedFloat<f32>,
+    // Sum
+    pub sum_arithshift: OrderedFloat<f32>,
+    pub sum_compare: OrderedFloat<f32>,
+    pub sum_ctransfer: OrderedFloat<f32>,
+    pub sum_ctransfercond: OrderedFloat<f32>,
+    pub sum_dtransfer: OrderedFloat<f32>,
+    pub sum_float: OrderedFloat<f32>,
+    pub sum_total: OrderedFloat<f32>,
+}
+
+impl From<(AFIJFeatureSubset, TikNibFuncFeatures)> for FinfoTiknib {
+    fn from(value: (AFIJFeatureSubset, TikNibFuncFeatures)) -> Self {
+        FinfoTiknib {
+            name: value.0.name,
+            edges: value.0.edges,
+            indegree: value.0.indegree,
+            outdegree: value.0.outdegree,
+            nlocals: value.0.nlocals,
+            nargs: value.0.nargs,
+            avg_arithshift: value.1.avg_arithshift,
+            avg_compare: value.1.avg_compare,
+            avg_ctransfer: value.1.avg_ctransfer,
+            avg_ctransfercond: value.1.avg_ctransfercond,
+            avg_dtransfer: value.1.avg_dtransfer,
+            avg_float: value.1.avg_float,
+            avg_total: value.1.avg_total,
+            sum_arithshift: value.1.sum_arithshift,
+            sum_compare: value.1.sum_compare,
+            sum_ctransfer: value.1.sum_ctransfer,
+            sum_ctransfercond: value.1.sum_ctransfercond,
+            sum_dtransfer: value.1.sum_dtransfer,
+            sum_float: value.1.sum_float,
+            sum_total: value.1.sum_total,
+        }
+    }
+}
diff --git a/src/dedup.rs b/src/dedup.rs
index 4647c0e..4c57ed2 100644
--- a/src/dedup.rs
+++ b/src/dedup.rs
@@ -9,9 +9,9 @@ use serde::{Deserialize, Serialize};
 use serde_json::json;
 use std::collections::hash_map::DefaultHasher;
 use std::collections::{HashMap, HashSet};
-use std::fs::{read_to_string, File};
+use std::fs::{read_dir, read_to_string, File};
 use std::hash::{Hash, Hasher};
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::string::String;
 use std::{fs, vec};
 
@@ -89,12 +89,12 @@ pub struct EsilFuncStringCorpus {
     pub binary_name_index: Vec<String>,
     pub uniq_binaries: Vec<String>,
     pub arch_index: Vec<String>,
-    pub output_path: String,
+    pub output_path: PathBuf,
 }
 
 /// A collection of processed Esil Function String files
 impl EsilFuncStringCorpus {
-    pub fn new(directory: &String, output_path: &String) -> Result {
+    pub fn new(directory: &PathBuf, output_path: &PathBuf) -> Result {
         let mut filepaths = Vec::new();
         let mut binary_name_index = Vec::new();
         let mut uniq_binaries = Vec::new();
         let mut arch_index = Vec::new();
@@ -123,10 +123,9 @@ impl EsilFuncStringCorpus {
             }
         }
 
-        let output_path: String = if !output_path.ends_with('/') {
-            format!("{}{}", output_path, "/")
-        } else {
-            output_path.to_string()
+        let mut output_path = output_path.to_owned();
+        if !output_path.to_string_lossy().to_string().ends_with('/') {
+            output_path.push("/");
         };
 
         Ok(EsilFuncStringCorpus {
@@ -135,7 +134,7 @@ impl EsilFuncStringCorpus {
             binary_name_index,
             uniq_binaries,
             arch_index,
-            output_path,
+            output_path: output_path.to_owned(),
         })
     }
 
@@ -235,7 +234,7 @@ impl EsilFuncStringCorpus {
     }
 
     /// Generate hash statistics from a func hash tuple collection
-    fn hash_stats(&self, original_len: usize, unique_func_has_tuples: &Vec<DedupEntry>) {
+    fn hash_stats(&self, original_len: usize, unique_func_has_tuples: &[DedupEntry]) {
         let unique_len = unique_func_has_tuples.len();
         let percent_difference: f32 =
             ((original_len as f32 - unique_len as f32) / original_len as f32) * 100.0;
@@ -278,7 +277,7 @@ impl EsilFuncStringCorpus {
 
         if !just_stats {
             let uniques_to_drop = json!(unique_func_hash_tuples);
-            let fname_string = format!("{}{}-dedup.json", self.output_path, &target_binary_name);
+            let fname_string = format!("{:?}{}-dedup.json", self.output_path, &target_binary_name);
             serde_json::to_writer(
                 &File::create(fname_string).expect("Failed to create writer"),
                 &uniques_to_drop,
@@ -291,25 +290,29 @@ impl EsilFuncStringCorpus {
 /// Struct and Impl for de-duplicating Call Graph Corpus's
 #[derive(Debug)]
 pub struct CGCorpus {
-    pub filepaths: Vec<String>,
-    pub output_path: String,
+    pub filepaths: Vec<PathBuf>,
+    pub output_path: PathBuf,
     pub filepath_format: String,
     pub node_type: CallGraphNodeFeatureType,
 }
 
 impl CGCorpus {
     pub fn new(
-        directory: &String,
-        output_path: &String,
+        directory: &PathBuf,
+        output_path: &PathBuf,
         filepath_format: &String,
         node_type: CallGraphNodeFeatureType,
     ) -> Result {
-        if !Path::new(output_path).exists() {
-            fs::create_dir(output_path).expect("Failed to create output directory!");
-            info!("Output path not found - Creating {}", output_path)
+        if !output_path.exists() {
+            let ret = fs::create_dir(output_path);
+            if ret.is_ok() {
+                info!("Output path not found - Creating {:?}", output_path)
+            } else {
+                info!("Output path {:?} found", output_path)
+            }
         }
 
-        let mut filepaths = Vec::new();
+        let mut filepaths: Vec<PathBuf> = Vec::new();
 
         // Load all JSON filepaths
         for file in WalkDir::new(directory)
@@ -317,21 +320,16 @@ impl CGCorpus {
            .filter_map(|file| file.ok())
         {
             if file.path().to_string_lossy().ends_with(".json") {
-                filepaths.push(file.clone().path().to_string_lossy().to_string());
+                filepaths.push(PathBuf::from(file.clone().path()));
             }
         }
 
         info!("Returning One Hop CG Corpus Struct");
-
-        let output_path = if output_path.ends_with('/') {
-            output_path.to_owned()
-        } else {
-            output_path.to_owned() + &*"/".to_string()
-        };
+        let output_path = output_path.to_owned();
 
         Ok(CGCorpus {
             filepaths,
-            output_path: output_path.to_string(),
+            output_path,
             filepath_format: filepath_format.to_string(),
             node_type,
         })
@@ -344,7 +342,7 @@ impl CGCorpus {
     }
 
     //fn dedup_corpus(data: &mut Vec<Option<CallGraphTypes>>, filepaths: &mut Vec<String>) {
-    fn dedup_corpus(data: &mut Vec<Option<CallGraphTypes>>, filepaths: &mut Vec<String>) {
+    fn dedup_corpus(data: &mut Vec<Option<CallGraphTypes>>, filepaths: &mut Vec<PathBuf>) {
         debug!("Creating the removal index");
         let mut seen = HashSet::new();
@@ -365,29 +363,49 @@ impl CGCorpus {
         }
     }
 
-    fn get_binary_name_cisco(filepath: &String) -> String {
+    fn dedup_corpus_inplace(data: &mut [Option<CallGraphTypes>], filepaths: &mut [PathBuf]) {
+        let mut seen = HashSet::new();
+        for (i, data_ele) in data.iter().enumerate() {
+            let hash_value = Self::calculate_hash(&data_ele);
+
+            if seen.contains(&hash_value) {
+                let ret = fs::remove_file(&filepaths[i]);
+                if ret.is_ok() {
+                    debug!("Sucessfully removed graph");
+                } else {
+                    error!("Unable to remove - {:?}", ret);
+                }
+            } else {
+                seen.insert(hash_value);
+            }
+        }
+    }
+
+    fn get_binary_name_cisco(filepath: &PathBuf) -> PathBuf {
         // Example: x86-gcc-9-O3_nping_cg-onehopcgcallers-meta
         let binary_intermediate = Path::new(filepath).parent().unwrap().file_name().unwrap();
-        binary_intermediate
-            .to_string_lossy()
-            .split('_')
-            .nth(1)
-            .unwrap()
-            .to_string()
-    }
-    fn get_binary_name_binkit(filepath: &String) -> String {
+        PathBuf::from(
+            binary_intermediate
+                .to_string_lossy()
+                .split('_')
+                .nth(1)
+                .unwrap(),
+        )
+    }
+    fn get_binary_name_binkit(filepath: &PathBuf) -> PathBuf {
         // Example: tar-1.34_gcc-8.2.0_x86_32_O3_rmt_cg-onehopcgcallers-meta
         let binary_intermediate = Path::new(filepath).parent().unwrap().file_name().unwrap();
-        binary_intermediate
-            .to_string_lossy()
-            .split('_')
-            .rev()
-            .nth(1)
-            .unwrap()
-            .to_string()
+        PathBuf::from(
+            binary_intermediate
+                .to_string_lossy()
+                .split('_')
+                .rev()
+                .nth(1)
+                .unwrap(),
+        )
     }
-    fn extract_binary_from_fps(&self) -> Vec<String> {
+    fn extract_binary_from_fps(&self) -> Vec<PathBuf> {
         let mut fp_binaries = Vec::new();
         // Process the file paths to get the associated binary of each path
         info!("Processing Filepaths to get binaries");
@@ -404,10 +422,10 @@ impl CGCorpus {
         fp_binaries
     }
 
-    fn get_unique_binary_fps(&self, fp_binaries: Vec<String>) -> Vec<Vec<String>> {
+    fn get_unique_binary_fps(&self, fp_binaries: Vec<PathBuf>) -> Vec<Vec<PathBuf>> {
         // Generate binary specific filepath vectors
-        let unique_binaries: Vec<_> = fp_binaries.iter().unique().collect();
-        let mut unique_binaries_fps: Vec<Vec<String>> = vec![Vec::new(); unique_binaries.len()];
+        let unique_binaries: Vec<&PathBuf> = fp_binaries.iter().unique().collect();
+        let mut unique_binaries_fps: Vec<Vec<PathBuf>> = vec![Vec::new(); unique_binaries.len()];
 
         for (file, binary) in self.filepaths.iter().zip(fp_binaries.iter()) {
             unique_binaries_fps[unique_binaries.iter().position(|&x| x == binary).unwrap()]
@@ -417,30 +435,34 @@ impl CGCorpus {
         unique_binaries_fps
     }
 
-    fn load_subset(&self, fp_subset: &[String]) -> Vec<Option<CallGraphTypes>> {
+    fn load_subset(&self, fp_subset: &[PathBuf]) -> Vec<Option<CallGraphTypes>> {
         let mut subset_loaded_data = Vec::new();
         for ele in fp_subset.iter() {
             let data = read_to_string(ele).expect(&format!("Unable to read file - {:?}", ele));
-            let json = serde_json::from_str::<CallGraphTypes>(&data)
-                .expect(&format!("Unable to load function data from {}", ele));
+            let json = serde_json::from_str::<CallGraphTypes>(&data);
 
-            let nodes_empty = match self.node_type {
-                CallGraphNodeFeatureType::CGName => json.as_cg_name().unwrap().nodes.is_empty(),
-                CallGraphNodeFeatureType::CGMeta => json.as_cg_meta().unwrap().nodes.is_empty(),
-                CallGraphNodeFeatureType::TikNib => json.as_tik_nib().unwrap().nodes.is_empty(),
-            };
+            if json.is_ok() {
+                let json = json.unwrap();
+                let nodes_empty = match self.node_type {
+                    CallGraphNodeFeatureType::CGName => json.as_cg_name().unwrap().nodes.is_empty(),
+                    CallGraphNodeFeatureType::CGMeta => json.as_cg_meta().unwrap().nodes.is_empty(),
+                    CallGraphNodeFeatureType::TikNib => json.as_tik_nib().unwrap().nodes.is_empty(),
+                };
 
-            if !nodes_empty {
-                subset_loaded_data.push(Some(json))
+                if !nodes_empty {
+                    subset_loaded_data.push(Some(json))
+                } else {
+                    subset_loaded_data.push(None)
+                }
             } else {
-                subset_loaded_data.push(None)
+                error!("Unable to load {:?}", ele);
             }
         }
         subset_loaded_data
     }
 
-    pub fn process_corpus(self) {
+    pub fn process_corpus(&self) {
         let fp_binaries = self.extract_binary_from_fps();
 
         // Generate binary specific filepath vectors
@@ -463,30 +485,99 @@ impl CGCorpus {
                 debug!("File processing complete - {}", idx);
             });
     }
-    pub fn save_corpus(&self, subset_loaded_data: Vec<CallGraphTypes>, fp_subset: &mut [String]) {
+
+    pub fn process_corpus_inplace(&self) {
+        let fp_binaries = self.extract_binary_from_fps();
+
+        // Generate binary specific filepath vectors
+        let mut unique_binaries_fps = self.get_unique_binary_fps(fp_binaries);
+
+        info!("Loading the filepaths");
+        unique_binaries_fps
+            .par_iter_mut()
+            .progress()
+            .enumerate()
+            .for_each(|(idx, fp_subset)| {
+                debug!("Subset Length: {}", fp_subset.len());
+                if fp_subset.len() > 3500000 {
+                    info!("Encountered a binary subset with more than 2.5M graphs. Chunking. Will have to repeat!");
+                    let mut chunked: Vec<_> = fp_subset
+                        .chunks(2000000)
+                        .map(|s| {
+                            let mut inner_vec = Vec::new();
+                            for ele in s {
+                                inner_vec.push(ele.to_owned());
+                            }
+                            inner_vec
+                        })
+                        .collect();
+
+                    info!("Created {} chunks of 2M (approx.)", chunked.len());
+                    for (i, ele) in chunked.iter_mut().enumerate() {
+                        let mut subset_loaded_data: Vec<Option<CallGraphTypes>> =
+                            self.load_subset(ele);
+                        debug!("Starting to deduplicate chunk {} for corpus {}", i, idx);
+                        Self::dedup_corpus_inplace(&mut subset_loaded_data, ele);
+                    }
+                } else {
+                    let mut subset_loaded_data: Vec<Option<CallGraphTypes>> =
+                        self.load_subset(fp_subset);
+                    debug!("Starting to deduplicate the corpus - {}", idx);
+                    Self::dedup_corpus_inplace(&mut subset_loaded_data, fp_subset);
+                }
+            });
+
+        Self::clean_up_empty_dirs(&self.output_path);
+    }
+
+    fn clean_up_empty_dirs(output_path: &PathBuf) {
+        for dir in WalkDir::new(output_path)
+            .into_iter()
+            .filter_map(|file| file.ok())
+        {
+            if dir.path().is_dir() {
+                let path = dir.path();
+                let dir_ret = read_dir(path);
+                if dir_ret.is_ok() {
+                    let is_empty = dir_ret.unwrap().next().is_none();
+                    if is_empty {
+                        let ret = fs::remove_dir(dir.path());
+                        if ret.is_ok() {
+                            debug!("Successfully removed {:?}", dir.path());
+                        } else {
+                            error!("Tried to remove {:?} but failed", dir.path());
+                        }
+                    };
+                }
+            }
+        }
+    }
+
+    fn generate_dedup_filepath(output_path: &Path, filepath: &Path) -> PathBuf {
+        let first_two = filepath.components().rev().take(2).collect::<Vec<_>>();
+        let first_two: PathBuf = first_two.iter().rev().collect();
+        let output = output_path.to_path_buf();
+        let mut final_path = PathBuf::new();
+        final_path.push(output);
+        final_path.push(first_two);
+
+        final_path
+    }
+    pub fn save_corpus(
+        &self,
+        subset_loaded_data: Vec<CallGraphTypes>,
+        fp_subset: &mut [PathBuf],
+    ) {
         subset_loaded_data
             .iter()
             .zip(fp_subset.iter())
             .for_each(|(data_ele, filepath)| {
-                let fixed_path: Vec<_> = Path::new(filepath)
-                    .components()
-                    .rev()
-                    .take(2)
-                    .collect::<Vec<_>>();
-                trace!("Fixed Path (First Pass): {:?}", fixed_path);
-                let fixed_path = fixed_path
-                    .iter()
-                    .map(|c| c.as_os_str().to_string_lossy().to_string())
-                    .rev()
-                    .collect::<Vec<_>>();
-                trace!("Fixed Path (Second Pass): {:?}", fixed_path);
-                let dirs = format!("{}{}", self.output_path, fixed_path[0]);
-                fs::create_dir_all(&dirs).expect("Failed to create output directory!");
+                let save_path = Self::generate_dedup_filepath(&self.output_path, filepath);
+                let dirs = save_path.parent().unwrap_or(Path::new(""));
+                fs::create_dir_all(dirs).expect("Failed to create output directory!");
 
-                let fixed_path = format!("{}/{}", dirs, fixed_path[1]);
-                trace!("Fixed Path (Final Pass): {:?}", fixed_path);
                 serde_json::to_writer(
-                    &File::create(fixed_path).expect("Failed to create writer"),
+                    &File::create(save_path).expect("Failed to create writer"),
                     &data_ele,
                 )
                 .expect("Unable to write JSON");
@@ -494,6 +585,7 @@ impl CGCorpus {
     }
 }
 
+#[cfg(test)]
 mod tests {
     use crate::dedup::CGCorpus;
     use crate::networkx::{
 };
     use std::fs;
     use std::fs::read_to_string;
-    use std::path::Path;
+    use std::path::{Path, PathBuf};
     use walkdir::WalkDir;
 
     // Test Dedup on typed CG's
     fn test_cg_corpus_gen() {
         // CG Corpus Generation
         let corpus = CGCorpus::new(
-            &"test-files/cg_dedup/to_dedup".to_string(),
-            &"test-files/cg_dedup/deduped".to_string(),
+            &PathBuf::from("test-files/cg_dedup/to_dedup"),
+            &mut PathBuf::from("test-files/cg_dedup/deduped"),
             &"cisco".to_string(),
             CallGraphNodeFeatureType::CGName,
         );
-        assert_eq!(corpus.as_ref().unwrap().filepaths.len(), 12);
-        assert_eq!(
-            corpus.as_ref().unwrap().output_path,
-            "test-files/cg_dedup/deduped/".to_string()
-        );
-        assert_eq!(
-            corpus.as_ref().unwrap().filepath_format,
-            "cisco".to_string()
-        );
-        let corpus = CGCorpus::new(
-            &"test-files/cg_dedup/to_dedup".to_string(),
-            &"test-files/cg_dedup/deduped/".to_string(),
-            &"cisco".to_string(),
-            CallGraphNodeFeatureType::CGName,
-        );
-        assert_eq!(corpus.as_ref().unwrap().filepaths.len(), 12);
-        assert_eq!(
-            corpus.as_ref().unwrap().output_path,
-            "test-files/cg_dedup/deduped/".to_string()
-        );
-        assert_eq!(
-            corpus.as_ref().unwrap().filepath_format,
-            "cisco".to_string()
-        );
+        if corpus.is_ok() {
+            let corpus = corpus.unwrap();
+            assert_eq!(corpus.filepaths.len(), 12);
+            assert_eq!(
+                corpus.output_path,
+                PathBuf::from("test-files/cg_dedup/deduped/")
+            );
+            assert_eq!(corpus.filepath_format, "cisco".to_string());
+
+            // clean up
+            if corpus.output_path.is_dir() {
+                fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!");
+            };
+
+            let corpus = CGCorpus::new(
+                &PathBuf::from("test-files/cg_dedup/to_dedup"),
+                &PathBuf::from("test-files/cg_dedup/deduped/"),
+                &"cisco".to_string(),
+                CallGraphNodeFeatureType::CGName,
+            );
+            if corpus.is_ok() {
+                let corpus = corpus.unwrap();
+                assert_eq!(corpus.filepaths.len(), 12);
+                assert_eq!(
+                    corpus.output_path,
+                    PathBuf::from("test-files/cg_dedup/deduped/")
+                );
+                assert_eq!(corpus.filepath_format, "cisco".to_string());
+                // clean up
+                if corpus.output_path.is_dir() {
+                    fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!");
+                }
+            }
+        }
     }
 
     #[test]
     fn test_extract_binary_from_fps() {
         let corpus = CGCorpus::new(
-            &"test-files/cg_dedup/to_dedup".to_string(),
-            &"test-files/cg_dedup/deduped".to_string(),
+            &PathBuf::from("test-files/cg_dedup/to_dedup"),
+            &PathBuf::from("test-files/cg_dedup/deduped"),
             &"cisco".to_string(),
             CallGraphNodeFeatureType::CGMeta,
         );
-        let fp_binaries = corpus.unwrap().extract_binary_from_fps();
-        assert_eq!(fp_binaries.len(), 12);
-        assert_eq!(
-            fp_binaries,
-            vec![
-                "testbin".to_string(),
-                "testbin".to_string(),
-                "testbin".to_string(),
-                "testbin".to_string(),
-                "testbin".to_string(),
-                "testbin".to_string(),
-                "testbin".to_string(),
-                "testbin".to_string(),
-                "testbin2".to_string(),
-                "testbin2".to_string(),
-                "testbin2".to_string(),
-                "testbin2".to_string(),
-            ]
-        )
+        if corpus.is_ok() {
+            let corpus = corpus.unwrap();
+            let fp_binaries = corpus.extract_binary_from_fps();
+            assert_eq!(fp_binaries.len(), 12);
+            assert_eq!(
+                fp_binaries,
+                vec![
+                    PathBuf::from("testbin"),
+                    PathBuf::from("testbin"),
+                    PathBuf::from("testbin"),
+                    PathBuf::from("testbin"),
+                    PathBuf::from("testbin"),
+                    PathBuf::from("testbin"),
+                    PathBuf::from("testbin"),
+                    PathBuf::from("testbin"),
+                    PathBuf::from("testbin2"),
+                    PathBuf::from("testbin2"),
+                    PathBuf::from("testbin2"),
+                    PathBuf::from("testbin2"),
+                ]
+            );
+            // clean up
+            if corpus.output_path.is_dir() {
+                fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!");
+            }
+        }
     }
 
     #[test]
     fn test_get_unique_binary_fps() {
         let corpus = CGCorpus::new(
-            &"test-files/cg_dedup/to_dedup".to_string(),
-            &"test-files/cg_dedup/deduped".to_string(),
+            &PathBuf::from("test-files/cg_dedup/to_dedup"),
+            &mut PathBuf::from("test-files/cg_dedup/deduped"),
             &"cisco".to_string(),
             CallGraphNodeFeatureType::CGMeta,
         )
         .unwrap();
+
         let fp_binaries = corpus.extract_binary_from_fps();
         let unique_binary_fps = corpus.get_unique_binary_fps(fp_binaries);
 
         assert_eq!(unique_binary_fps.len(), 2);
         assert_eq!(unique_binary_fps[0].len(), 8);
         assert_eq!(unique_binary_fps[1].len(), 4);
+
+        // clean up
+        if corpus.output_path.is_dir() {
+            let _ = fs::remove_dir_all(&corpus.output_path);
+        }
     }
 
     #[test]
     fn test_processing_unique_binary_collection() {
         let corpus = CGCorpus::new(
-            &"test-files/cg_dedup/to_dedup".to_string(),
-            &"test-files/cg_dedup/deduped".to_string(),
+            &PathBuf::from("test-files/cg_dedup/to_dedup"),
+            &mut PathBuf::from("test-files/cg_dedup/deduped"),
            &"cisco".to_string(),
             CallGraphNodeFeatureType::CGMeta,
         )
         .unwrap();
@@ -606,13 +721,18 @@ mod tests {
         assert_eq!(subset_loaded.len(), 8);
         subset_loaded.retain(|c| c.is_some());
         assert_eq!(subset_loaded.len(), 8);
+
+        // clean up
+        if corpus.output_path.is_dir() {
+            fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!");
+        }
     }
 
     #[test]
     fn test_dedup_binary_subset() {
         let corpus = CGCorpus::new(
-            &"test-files/cg_dedup/to_dedup".to_string(),
-            &"test-files/cg_dedup/deduped".to_string(),
+            &PathBuf::from("test-files/cg_dedup/to_dedup"),
+            &mut PathBuf::from("test-files/cg_dedup/deduped"),
             &"cisco".to_string(),
             CallGraphNodeFeatureType::CGMeta,
         )
         .unwrap();
@@ -688,7 +808,9 @@ mod tests {
         }
 
         // clean up
-        fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!");
+        if corpus.output_path.is_dir() {
+            fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!");
+        }
     }
 
     // Test binary name extraction
     #[test]
     fn test_binkit_binary_extraction() {
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"which-2.21_gcc-9.4.0_arm_32_O2_which_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
-".to_string()
+                &PathBuf::from("which-2.21_gcc-9.4.0_arm_32_O2_which_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
+")
             ),
-            "which"
+            PathBuf::from("which")
         );
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"recutils-1.9_gcc-11.2.0_mips_64_O3_recins_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
-".to_string()
+                &PathBuf::from("recutils-1.9_gcc-11.2.0_mips_64_O3_recins_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
+")
             ),
-            "recins"
+            PathBuf::from("recins")
         );
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"recutils-1.9_gcc-11.2.0_mips_64_O3_recsel_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
-".to_string(),
+                &PathBuf::from("recutils-1.9_gcc-11.2.0_mips_64_O3_recsel_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
+"),
             ),
-            "recsel",
+            PathBuf::from("recsel"),
         );
     }
 
     #[test]
     fn test_cisco_binary_extraction() {
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"arm64-clang-9-Os_curl_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json".to_string()
+                &PathBuf::from("arm64-clang-9-Os_curl_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json")
             ),
-            "curl"
+            PathBuf::from("curl")
         );
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"x86-clang-9-Os_libcrypto.so.3_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
-".to_string()
+                &PathBuf::from("x86-clang-9-Os_libcrypto.so.3_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
+")
             ),
-            "libcrypto.so.3"
+            PathBuf::from("libcrypto.so.3")
         );
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"x86-gcc-9-O3_unrar_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
-".to_string(),
+                &PathBuf::from("x86-gcc-9-O3_unrar_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
+"),
             ),
-            "unrar",
+            PathBuf::from("unrar"),
         );
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"/random/path/before/x86-gcc-9-O3_unrar_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
-".to_string(),
+                &PathBuf::from("/random/path/before/x86-gcc-9-O3_unrar_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json
+"),
             ),
-            "unrar",
+            PathBuf::from("unrar"),
         );
     }
 
     #[test]
     fn test_trex_binary_extraction() {
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"arm-32_binutils-2.34-O0_elfedit_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json".to_string()
+                &PathBuf::from("arm-32_binutils-2.34-O0_elfedit_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json")
             ),
-            "elfedit"
+            PathBuf::from("elfedit")
         );
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"arm-32_binutils-2.34-O0_objdump_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json".to_string()
+                &PathBuf::from("arm-32_binutils-2.34-O0_objdump_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json")
             ),
-            "objdump"
+            PathBuf::from("objdump")
         );
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"arm-32_binutils-2.34-O0_nm-new_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json".to_string()
+                &PathBuf::from("arm-32_binutils-2.34-O0_nm-new_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json")
            ),
-            "nm-new"
+            PathBuf::from("nm-new")
         );
 
         // __ for c++ bins that sometimes crop up
         assert_eq!(
             crate::dedup::CGCorpus::get_binary_name_binkit(
-                &"arm-32_binutils-2.34-O0_nm-new_cg-onehopcgcallers-meta/sym.dummy___func__-onehopcgcallers-meta.json".to_string()
+                &PathBuf::from("arm-32_binutils-2.34-O0_nm-new_cg-onehopcgcallers-meta/sym.dummy___func__-onehopcgcallers-meta.json")
             ),
-            "nm-new"
+            PathBuf::from("nm-new")
         );
         assert_eq!(
-            crate::dedup::CGCorpus::get_binary_name_binkit(&"fast-disk/Dataset-2/cgs/x86-32_coreutils-8.32-O1_stat_cg-onehopcgcallers-meta/main-onehopcgcallers-meta.json".to_string()),
-            "stat"
+            crate::dedup::CGCorpus::get_binary_name_binkit(&PathBuf::from("fast-disk/Dataset-2/cgs/x86-32_coreutils-8.32-O1_stat_cg-onehopcgcallers-meta/main-onehopcgcallers-meta.json")),
+            PathBuf::from("stat")
        );
-        assert_eq!(crate::dedup::CGCorpus::get_binary_name_binkit(&"/fast-disk/processed_datasets/Dataset-2/arm-32_binutils-2.34-O0_addr2line_cg-onehopcgcallers-meta/sym.adjust_relative_path-onehopcgcallers-meta.json".to_string()),
-        "addr2line")
+        assert_eq!(crate::dedup::CGCorpus::get_binary_name_binkit(&PathBuf::from("/fast-disk/processed_datasets/Dataset-2/arm-32_binutils-2.34-O0_addr2line_cg-onehopcgcallers-meta/sym.adjust_relative_path-onehopcgcallers-meta.json")),
+        PathBuf::from("addr2line"))
     }
 }
diff --git a/src/extract.rs b/src/extract.rs
index 8d01bd3..2fafed0 100644
--- a/src/extract.rs
+++ b/src/extract.rs
@@ -1,18 +1,25 @@
 use crate::afij::AFIJFunctionInfo;
-use crate::agcj::AGCJFunctionCallGraphs;
+use crate::agcj::AGCJFunctionCallGraph;
+
+use anyhow::anyhow;
 use anyhow::bail;
 use anyhow::Error;
 use anyhow::Result;
 use r2pipe::R2Pipe;
 use r2pipe::R2PipeSpawnOptions;
+
 use serde::{Deserialize, Serialize};
 use serde_aux::prelude::*;
 use serde_json;
+
 use serde_json::{json, Value};
 use std::collections::HashMap;
+use std::env;
+
 use std::fs;
 use std::fs::File;
-use std::path::Path;
+
+use std::path::{Path, PathBuf};
 use walkdir::WalkDir;
 
 #[derive(PartialEq, Debug)]
@@ -35,25 +42,33 @@ pub enum ExtractionJobType {
 
 #[derive(Debug)]
 pub struct FileToBeProcessed {
-    pub file_path: String,
-    pub output_path: String,
+    pub file_path: PathBuf,
+    pub output_path: PathBuf,
     pub job_type_suffix: String,
+    pub r2p_config: R2PipeConfig,
 }
 
 #[derive(Debug)]
 pub struct ExtractionJob {
-    pub input_path: String,
+    pub input_path: PathBuf,
     pub input_path_type: PathType,
     pub job_type: ExtractionJobType,
     pub files_to_be_processed: Vec<FileToBeProcessed>,
-    pub output_path: String, // Remove - Kept for backwards compat
+    pub output_path: PathBuf, // Remove - Kept for backwards compat
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct R2PipeConfig {
+    pub debug: bool,
+    pub extended_analysis: bool,
+    pub use_curl_pdb: bool,
 }
 
 impl std::fmt::Display for ExtractionJob {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         write!(
             f,
-            "bin_path: {} p_type: {:?} what_do: {:?}",
+            "bin_path: {:?} p_type: {:?} what_do: {:?}",
             self.input_path, self.input_path_type, self.job_type
         )
     }
@@ -159,7 +174,6 @@ pub struct Codexref {
 
 // Structs related to AEAFJ
 #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
 pub struct AEAFJRegisterBehaviour {
     #[serde(rename = "A")]
     pub a: Vec<String>,
@@ -200,29 +214,35 @@ impl std::fmt::Display for AFLJFuncDetails {
     }
 }
 
-impl From<(String, String, String)> for FileToBeProcessed {
-    fn from(orig: (String, String, String)) -> FileToBeProcessed {
+impl From<(String, String, String, R2PipeConfig)> for FileToBeProcessed {
+    fn from(orig: (String, String, String, R2PipeConfig)) -> FileToBeProcessed {
         FileToBeProcessed {
-            file_path: orig.0,
-            output_path: orig.1,
+            file_path: PathBuf::from(orig.0),
+            output_path: PathBuf::from(orig.1),
             job_type_suffix: orig.2,
+            r2p_config: orig.3,
         }
     }
}

 impl ExtractionJob {
-    pub fn new(input_path: &str, output_path: &str, mode: &str) -> Result<ExtractionJob> {
-        fn get_path_type(bin_path: &str) -> PathType {
+    pub fn new(
+        input_path: &PathBuf,
+        output_path: &PathBuf,
+        mode: &str,
+        debug: &bool,
+        extended_analysis: &bool,
+        use_curl_pdb: &bool,
+    ) -> Result<ExtractionJob> {
+        fn get_path_type(bin_path: &PathBuf) -> PathType {
             let fpath_md = fs::metadata(bin_path).unwrap();
-            let p_type: PathType;
             if fpath_md.is_file() {
-                p_type = PathType::File;
+                PathType::File
             } else if fpath_md.is_dir() {
-                p_type = PathType::Dir;
+                PathType::Dir
             } else {
-                p_type = PathType::Unk;
+                PathType::Unk
             }
-            p_type
         }
 
         // This functionality is currently not being used!
@@ -239,45 +259,60 @@ impl ExtractionJob {
             }
         }
 
+        let r2_handle_config = R2PipeConfig {
+            debug: *debug,
+            extended_analysis: *extended_analysis,
+            use_curl_pdb: *use_curl_pdb,
+        };
+
         let p_type = get_path_type(input_path);
         let job_type = extraction_job_matcher(mode).unwrap();
 
         if p_type == PathType::File {
             let file = FileToBeProcessed {
-                file_path: input_path.to_string(),
-                output_path: output_path.to_string(),
+                file_path: input_path.to_owned(),
+                output_path: output_path.to_owned(),
                 job_type_suffix: (*mode).to_string(),
+                r2p_config: r2_handle_config,
             };
             Ok(ExtractionJob {
-                input_path: input_path.to_string(),
+                input_path: input_path.to_owned(),
                 input_path_type: p_type,
                 job_type,
                 files_to_be_processed: vec![file],
-                output_path: (*output_path).to_string(),
+                output_path: output_path.to_owned(),
             })
         } else if p_type == PathType::Dir {
-            let files = ExtractionJob::get_file_paths_dir(input_path.to_string());
-            let files_with_output_path: Vec<(String, String, String)> = files
+            let files = ExtractionJob::get_file_paths_dir(input_path);
+
+            let files_with_output_path: Vec<(String, String, String, R2PipeConfig)> = files
                 .into_iter()
-                .map(|f| (f, output_path.to_string(), mode.to_string()))
+                .map(|f| {
+                    (
+                        f,
+                        output_path.to_string_lossy().to_string(),
+                        mode.to_string(),
+                        r2_handle_config,
+                    )
+                })
                 .collect();
             let files_to_be_processed: Vec<FileToBeProcessed> = files_with_output_path
                 .into_iter()
                 .map(FileToBeProcessed::from)
                 .collect();
             Ok(ExtractionJob {
-                input_path: input_path.to_string(),
+                input_path: input_path.to_owned(),
                 input_path_type: p_type,
                 job_type,
                 files_to_be_processed,
-                output_path: output_path.to_string(),
+                output_path: output_path.to_owned(),
             })
         } else {
            bail!("Failed to create extraction job.")
         }
     }
 
-    fn get_file_paths_dir(input_path: String) -> Vec<String> {
+    fn get_file_paths_dir(input_path: &PathBuf) -> Vec<String> {
         let mut str_vec: Vec<String> = Vec::new();
         for file in WalkDir::new(input_path)
             .into_iter()
             .filter_map(|file| file.ok())
@@ -296,49 +331,54 @@ impl ExtractionJob {
 }
 
 impl FileToBeProcessed {
-    pub fn extract_register_behaviour(&self, debug: &bool) {
+    pub fn extract_register_behaviour(&self) {
         info!("Starting register behaviour extraction");
-        let mut r2p = self.setup_r2_pipe(&self.file_path, debug);
+        let mut r2p = self.setup_r2_pipe();
         let function_details = self.get_function_name_list(&mut r2p);
-        let mut register_behaviour_vec: HashMap<String, AEAFJRegisterBehaviour> = HashMap::new();
-        info!("Executing aeafj for each function");
-        for function in function_details.iter() {
-            r2p.cmd(format!("s @ {}", &function.name).as_str())
-                .expect("Command failed..");
-            let json = r2p.cmd("aeafj").expect("Command failed..");
-            let json_obj: AEAFJRegisterBehaviour =
-                serde_json::from_str(&json).expect("Unable to convert to JSON object!");
-            register_behaviour_vec.insert(function.name.clone(), json_obj);
-        }
-        info!("All functions processed");
-        r2p.close();
-        info!("r2p closed");
+        if function_details.is_ok() {
+            let mut register_behaviour_vec: HashMap<String, AEAFJRegisterBehaviour> =
+                HashMap::new();
+            info!("Executing aeafj for each function");
+            for function in function_details.unwrap().iter() {
+                r2p.cmd(format!("s @ {}", &function.name).as_str())
+                    .expect("Command failed..");
+                let json = r2p.cmd("aeafj").expect("Command failed..");
+                let json_obj: AEAFJRegisterBehaviour =
+                    serde_json::from_str(&json).expect("Unable to convert to JSON object!");
+                register_behaviour_vec.insert(function.name.clone(), json_obj);
+            }
+            info!("All functions processed");
+            r2p.close();
+            info!("r2p closed");
 
-        info!("Writing extracted data to file");
-        self.write_to_json(&json!(register_behaviour_vec))
+            info!("Writing extracted data to file");
+            self.write_to_json(&json!(register_behaviour_vec))
+        } else {
+            error!(
+                "Failed to extract function details to generate register behaviour - Error in r2 extraction for {:?}",
+                self.file_path
+            )
+        }
     }
 
-    // TODO: Refactor this so it uses the AGFJ struct
-    pub fn extract_func_cfgs(&self, debug: &bool) {
+    pub fn extract_func_cfgs(&self) {
         let mut fp_filename = Path::new(&self.file_path)
             .file_name()
             .expect("Unable to get filename")
             .to_string_lossy()
             .to_string();
         fp_filename = fp_filename + "_" + &self.job_type_suffix.clone();
-        let f_name = format!("{}/{}.json", &self.output_path, fp_filename);
+        let f_name = format!("{:?}/{}.json", &self.output_path, fp_filename);
         if !Path::new(&f_name).exists() {
             info!("{} not found. Continuing processing.", f_name);
-            // This creates HUGE JSON files for each files
-            // Approximately 40x file size to JSON
-            let mut r2p = self.setup_r2_pipe(&self.file_path, debug);
-            info!("Executing agfj @@f on {}", self.file_path);
-            let mut json = r2p.cmd("agfj @@f").expect("Command failed..");
-
-            info!("Closing r2p process for {}", self.file_path);
+            let mut r2p = self.setup_r2_pipe();
+            info!("Executing agfj @@f on {:?}", self.file_path);
+            let mut json = r2p
+                .cmd("agfj @@f")
+                .expect("Failed to extract control flow graph information.");
+            info!("Closing r2p process for {:?}", self.file_path);
             r2p.close();
-
-            info!("Starting JSON fixup for {}", self.file_path);
+            info!("Starting JSON fixup for {:?}", self.file_path);
             // Fix JSON object
             json = json.replace("[]\n", ",");
             json = json.replace("}]\n[{", "}],\n[{");
@@ -347,7 +387,7 @@ impl FileToBeProcessed {
             json = json.replace("}]\n,]", "}]\n]");
             json = json.replace("\n,,[{", "\n,[{");
             json = json.replace("\n,,[{", "\n,[{");
-            info!("JSON fixup finished for {}", self.file_path);
+            info!("JSON fixup finished for {:?}", self.file_path);
 
             if json != "[,]" {
                 #[allow(clippy::expect_fun_call)]
@@ -369,11 +409,11 @@ impl FileToBeProcessed {
         }
     }
 
-    pub fn extract_function_call_graphs(&self, debug: &bool) {
+    pub fn extract_function_call_graphs(&self) {
         info!("Starting function call graph extraction");
-        let mut r2p = self.setup_r2_pipe(&self.file_path, debug);
+        let mut r2p = self.setup_r2_pipe();
         let json = r2p.cmd("agCj").expect("agCj command failed to execute");
-        let function_call_graphs: Vec<AGCJFunctionCallGraphs> =
+        let function_call_graphs: Vec<AGCJFunctionCallGraph> =
             serde_json::from_str(&json).expect("Unable to convert to JSON object!");
         info!("Function call graph extracted.");
         r2p.close();
@@ -383,67 +423,77 @@ impl FileToBeProcessed {
         self.write_to_json(&json!(function_call_graphs))
     }
 
-    pub fn extract_function_xrefs(&self, debug: &bool) {
-        let mut r2p = self.setup_r2_pipe(&self.file_path, debug);
+    pub fn extract_function_xrefs(&self) {
+        let mut r2p = self.setup_r2_pipe();
         let function_details = self.get_function_name_list(&mut r2p);
         let mut function_xrefs: HashMap<String, Vec<FunctionXrefDetails>> = HashMap::new();
         info!("Extracting xrefs for each function");
-        for function in function_details.iter() {
-            let ret = self.get_function_xref_details(function.offset, &mut r2p);
-            function_xrefs.insert(function.name.clone(), ret);
-        }
-        info!("All functions processed");
-        r2p.close();
-        info!("r2p closed");
+        if function_details.is_ok() {
+            for function in function_details.unwrap().iter() {
+                let ret = self.get_function_xref_details(function.offset, &mut r2p);
+                function_xrefs.insert(function.name.clone(), ret);
+            }
+            info!("All functions processed");
+            r2p.close();
+            info!("r2p closed");
 
-        info!("Writing extracted data to file");
-        self.write_to_json(&json!(function_xrefs))
+            info!("Writing extracted data to file");
+            self.write_to_json(&json!(function_xrefs))
+        } else {
+            error!(
+                "Failed to extract function xrefs - Error in r2 extraction for {:?}",
+                self.file_path
+            )
+        }
     }
 
-    pub fn extract_function_info(&self, debug: &bool) {
+    pub fn extract_function_info(&self) {
         info!("Starting function metdata extraction");
-        let mut fp_filename = Path::new(self.file_path.as_str())
+        let mut fp_filename = self
+            .file_path
             .file_name()
             .expect("Unable to get filename")
             .to_string_lossy()
             .to_string();
 
         fp_filename = fp_filename + "_" + &self.job_type_suffix.clone();
-        let f_name = format!("{}/{}.json", self.output_path, fp_filename);
+        let f_name = format!("{:?}/{}.json", self.output_path, fp_filename);
         if !Path::new(&f_name).exists() {
-            let mut r2p = self.setup_r2_pipe(&self.file_path, debug);
-            let function_details = self.get_function_name_list(&mut r2p);
-            let mut function_info: Vec<Vec<AFIJFunctionInfo>> = Vec::new();
-            info!("Extracting function metadata");
-            for function in function_details.iter() {
-                debug!("Processing {}", function.name);
-                let ret = self.get_function_info(function.offset, &mut r2p);
-                debug!("Metadata Collected: {:?}", ret);
-                function_info.push(ret);
-            }
-            info!("All functions processed");
-            r2p.close();
-            info!("r2p closed");
+            let mut r2p = self.setup_r2_pipe();
 
-            info!("Writing extracted data to file");
-            self.write_to_json(&json!(function_info
-                .into_iter()
-                .flatten()
-                .collect::<Vec<AFIJFunctionInfo>>()))
+            let function_details: Result<Vec<AFLJFuncDetails>, r2pipe::Error> =
+                self.get_function_name_list(&mut r2p);
+
+            if function_details.is_err() {
+                error!("Unable to extract function info for {:?}", self.file_path);
+                r2p.close();
+                info!("r2p closed");
+            } else {
+                r2p.close();
+                info!("r2p closed");
+
+                info!("Writing extracted data to file");
+                self.write_to_json(&json!(function_details.unwrap()))
+            }
         }
     }
 
     // r2 commands to structs
-    fn get_function_name_list(&self, r2p: &mut R2Pipe) -> Vec<AFLJFuncDetails> {
+    fn get_function_name_list(
+        &self,
+        r2p: &mut R2Pipe,
+    ) -> Result<Vec<AFLJFuncDetails>, r2pipe::Error> {
         info!("Getting function information from binary");
-        let json = r2p
-            .cmd("aflj")
-            .expect(&format!("aflj command failed for {}", self.file_path));
-        let json_obj: Vec<AFLJFuncDetails> = serde_json::from_str(&json)
-            .expect(&format!("Unable to convert to JSON object! - {}", json));
+        let json = r2p.cmd("aflj");
 
-        json_obj
+        if let Ok(json_str) = json {
+            let json_obj: Vec<AFLJFuncDetails> = serde_json::from_str(json_str.as_ref())
+                .expect("Unable to convert to JSON object!");
+            Ok(json_obj)
+        } else {
+            Err(json.unwrap_err())
+        }
     }
 
     fn get_function_xref_details(
         json_obj
     }
 
-    fn get_function_info(&self, function_addr: u64, r2p: &mut R2Pipe) -> Vec<AFIJFunctionInfo> {
-        Self::go_to_address(r2p, function_addr);
-        let json = r2p.cmd("afij").expect("afij command failed");
-        let json_obj: Vec<AFIJFunctionInfo> = serde_json::from_str(&json)
-            .expect(&format!("Unable to convert to JSON object! - {}", json));
-        json_obj
-    }
-
     // Helper Functions
     fn write_to_json(&self, json_obj: &Value) {
-        let mut fp_filename = Path::new(self.file_path.as_str())
+        let mut fp_filename = self
+            .file_path
             .file_name()
             .expect("Unable to get filename")
             .to_string_lossy()
             .to_string();
 
-        fp_filename = fp_filename + "_" + &self.job_type_suffix.clone();
-        let f_name = format!("{}/{}.json", self.output_path, fp_filename);
+        fp_filename = fp_filename + "_" + &self.job_type_suffix.clone() + ".json";
+
+        let mut output_filepath = PathBuf::new();
+        output_filepath.push(self.output_path.clone());
+        output_filepath.push(fp_filename);
+        debug!("Save filename: {:?}", output_filepath);
+
         serde_json::to_writer(
-            &File::create(&f_name).expect("Unable to create file!"),
+            &File::create(&output_filepath).expect("Unable to create file!"),
            &json_obj,
         )
-        .unwrap_or_else(|_| panic!("the world is ending: {}", f_name));
+        .unwrap_or_else(|_| panic!("the world is ending: {:?}", output_filepath));
     }
 
     fn go_to_address(r2p: &mut R2Pipe, function_addr: u64) {
@@ -504,33 +551,80 @@ impl FileToBeProcessed {
             .expect("failed to seek addr");
     }
 
-    fn setup_r2_pipe(&self, s: &String, debug: &bool) -> R2Pipe {
-        // Setup R2 pipe with options and return it
-        // Could be extended to include toggling of options
-        // + more args?
-        let opts = if !(*debug) {
-            debug!("Creating r2 handle without debugging");
-            R2PipeSpawnOptions {
-                exepath: "r2".to_owned(),
-                args: vec!["-e bin.cache=true", "-e log.level=1", "-2"],
-            }
+    fn handle_symbols_pdb(&self, r2p: &mut R2Pipe) -> Result<(), Error> {
+        // Download symbols if available
+        debug!("Downloading pdb file for {:?}", self.file_path);
+        let download_pdb = r2p.cmd("idpd");
+
+        debug!("Download PDB Ret: {:?}", download_pdb);
+
+        if download_pdb.unwrap().contains("success") {
+            let ret = r2p.cmd("idp");
+            debug!("Return value: {:?}", ret);
+
+            Ok(())
         } else {
+            Err(anyhow!("Unable to download pdb"))
+        }
+    }
+
+    fn setup_r2_pipe(&self) -> R2Pipe {
+        if self.r2p_config.use_curl_pdb {
+            // Docs suggest this is unsafe
+            env::set_var("R2_CURL", "1");
+        }
+
+        let opts = if self.r2p_config.debug {
             debug!("Creating r2 handle with debugging");
             R2PipeSpawnOptions {
-                exepath: "r2".to_owned(),
+                exepath: "radare2".to_owned(),
                 args: vec!["-e bin.cache=true", "-e log.level=0"],
             }
+        } else {
+            debug!("Creating r2 handle without debugging");
+            R2PipeSpawnOptions {
+                exepath: "radare2".to_owned(),
+                args: vec!["-e bin.cache=true", "-e log.level=1", "-2"],
+            }
         };
-        debug!("Attempting to create r2pipe using {}", s);
+
+        debug!("Attempting to create r2pipe using {:?}", self.file_path);
         let mut r2p = match R2Pipe::in_session() {
             Some(_) => R2Pipe::open().expect("Unable to open R2Pipe"),
-            None => R2Pipe::spawn(s, Some(opts)).expect("Failed to spawn new R2Pipe"),
+            None => R2Pipe::spawn(self.file_path.to_str().unwrap(), Some(opts))
+                .expect("Failed to spawn new R2Pipe"),
         };
 
-        debug!("Executing 'aa' r2 command for {}", s);
-        r2p.cmd("aa")
-            .expect("Unable to complete standard analysis!");
-        debug!("'aa' r2 command complete for {}", s);
+        let info = r2p.cmdj("ij");
+        if info.is_ok() {
+            let info = info.unwrap();
+            if info["bin"]["bintype"].as_str().unwrap() == "pe" {
+                debug!("PE file found. Handling symbol download!");
+                let ret = self.handle_symbols_pdb(&mut r2p);
+
+                if ret.is_err() {
+                    error!("Unable to get PDB info")
+                }
+            }
+        }
+
+        if self.r2p_config.extended_analysis {
+            debug!(
+                "Executing 'aaa' r2 command for {}",
+                self.file_path.display()
+            );
+            r2p.cmd("aaa")
+                .expect("Unable to complete standard analysis!");
+            debug!("'aaa' r2 command complete for {}", self.file_path.display());
+        } else {
+            debug!("Executing 'aa' r2 command for {}", self.file_path.display());
+            r2p.cmd("aa")
+                .expect("Unable to complete standard analysis!");
+            debug!(
+                "'aa' r2 command complete for {:?}",
+                self.file_path.display()
+            );
        };
 
         r2p
     }
 }
diff --git a/src/files.rs b/src/files.rs
index 08a61b2..1cdc209 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -1,14 +1,21 @@
-use crate::afij::{AFIJFeatureSubset, AFIJFunctionInfo};
-use crate::agcj::AGCJFunctionCallGraphs;
+use crate::afij::{AFIJFeatureSubset, AFIJFeatureSubsetExtended, AFIJFunctionInfo};
+use crate::agcj::AGCJFunctionCallGraph;
 use crate::agfj::{AGFJFunc, TikNibFunc};
 use crate::bb::{FeatureType, InstructionMode};
 use crate::consts::*;
 use crate::errors::FileLoadError;
 #[cfg(feature = "inference")]
 use crate::inference::InferenceJob;
+use crate::networkx::{
+    CallGraphFuncWithMetadata, CallGraphTikNibFeatures, CallGraphTypes, NetworkxDiGraph,
+};
 use crate::utils::get_save_file_path;
 use enum_as_inner::EnumAsInner;
 use indicatif::ParallelProgressIterator;
+use itertools::Itertools;
+
+use crate::DataType;
+use petgraph::{Graph, Incoming, Outgoing};
 use rayon::iter::ParallelIterator;
 use rayon::prelude::{IntoParallelRefIterator, IntoParallelRefMutIterator};
 use serde::{Deserialize, Serialize};
@@ -16,17 +23,19 @@ use serde_json::json;
 use std::collections::HashMap;
 use std::fs::{read_to_string, File};
 use std::io::{BufWriter, Write};
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::string::String;
 use std::sync::mpsc::channel;
 #[cfg(feature = "inference")]
 use std::sync::Arc;
+#[cfg(feature = "inference")]
+use tch::nn::func;
 
 #[derive(Serialize, Deserialize, Debug)]
 pub struct AGFJFile {
-    pub filename: String,
+    pub filename: PathBuf,
     pub functions: Option<Vec<Vec<AGFJFunc>>>,
-    pub output_path: String,
+    pub output_path: PathBuf,
     pub min_blocks: u16,
     pub feature_type: Option<FeatureType>,
     pub architecture: Option<String>,
@@ -51,18 +60,18 @@ impl AGFJFile {
 
     pub fn load_and_deserialize(&mut self) -> Result<(), ()> {
         let data = read_to_string(&self.filename).expect("Unable to read file");
-
         #[allow(clippy::expect_fun_call)]
         // Kept in to ensure that the JSON decode error message is printed alongside the filename
-        let json: Vec<Vec<AGFJFunc>> = serde_json::from_str(&data).expect(&format!(
-            "Unable to load function data from {}",
-            self.filename
-        ));
+        let json = serde_json::from_str(&data);
 
-        self.functions = Some(json);
+        if json.is_ok() {
+            self.functions = Some(json.unwrap());
 
-        self.architecture = self.detect_architecture();
+            self.architecture = self.detect_architecture();
 
-        Ok(())
+            Ok(())
+        } else {
+            Err(())
+        }
     }
 
     /// Detects the architecture of a file by iterating through the functions
@@ -139,11 +148,12 @@ impl AGFJFile {
     /// It is *not* suitable for doing any other sort of tasks such as Next Sentence
     /// Prediction (NSP) as there is not indication of where a basic block starts or ends.
pub fn generate_random_bb_walk(mut self, esil: bool, pairs: bool) { - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); let fname_string = if esil { - format!("{}-esil-singles-rwdfs.txt", fname_string) + format!("{:?}-esil-singles-rwdfs.txt", fname_string) } else { - format!("{}-dis-singles-rwdfs.txt", fname_string) + format!("{:?}-dis-singles-rwdfs.txt", fname_string) }; if !Path::new(&fname_string).exists() { @@ -189,8 +199,9 @@ impl AGFJFile { /// Generates a single string which contains the ESIL representation of every /// instruction within a function pub fn generate_esil_func_strings(mut self) { - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); - let fname_string = format!("{}-efs.json", fname_string); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); + let fname_string = format!("{:?}-efs.json", fname_string); if !Path::new(&fname_string).exists() { self.load_and_deserialize() @@ -229,8 +240,9 @@ impl AGFJFile { pub fn generate_disasm_func_strings(mut self) { // This needs to be amended so that there is a AGFJFunc function // that returns a function as a func string. - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); - let fname_string = format!("{}-dfs.json", fname_string); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); + let fname_string = format!("{:?}-dfs.json", fname_string); if !Path::new(&fname_string).exists() { self.load_and_deserialize() @@ -254,9 +266,9 @@ impl AGFJFile { let map: HashMap<_, _> = fixed.into_iter().collect(); let json = json!(map); - let fname_string: String = - get_save_file_path(&self.filename, &self.output_path, None); - let fname_string = format!("{}-dfs.json", fname_string); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); + let fname_string = format!("{:?}-dfs.json", fname_string); serde_json::to_writer( &File::create(fname_string).expect("Failed to create writer"), @@ -273,11 +285,12 @@ impl AGFJFile { /// This ignores control flow and simple iterates the JSON objects from the top to /// the bottom. 
pub fn generate_linear_bb_walk(mut self, esil: bool) { - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); let fname_string = if esil { - format!("{}-esil-singles.txt", fname_string) + format!("{:?}-esil-singles.txt", fname_string) } else { - format!("{}-dis-singles.txt", fname_string) + format!("{:?}-dis-singles.txt", fname_string) }; if !Path::new(&fname_string).exists() { @@ -335,8 +348,9 @@ impl AGFJFile { } let json = json!(&func_feature_vectors); - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); - let fname_string = format!("{}-tiknib.json", fname_string); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); + let fname_string = format!("{}-tiknib.json", fname_string.to_string_lossy()); serde_json::to_writer( &File::create(fname_string).expect("Failed to create writer"), &json, @@ -371,14 +385,15 @@ impl AGFJFile { #[serde(untagged)] pub enum FunctionMetadataTypes { AFIJ(Vec<AFIJFeatureSubset>), + AFIJExtended(Vec<AFIJFeatureSubsetExtended>), AGFJ(Vec<TikNibFunc>), } #[derive(Serialize, Deserialize, Debug)] pub struct AGCJFile { - pub filename: String, - pub function_call_graphs: Option<Vec<AGCJFunctionCallGraphs>>, - pub output_path: String, + pub filename: PathBuf, + pub function_call_graphs: Option<Vec<AGCJFunctionCallGraph>>, + pub output_path: PathBuf, pub function_metadata: Option<FunctionMetadataTypes>, pub include_unk: bool, } @@ -389,18 +404,222 @@ impl AGCJFile { #[allow(clippy::expect_fun_call)] // Kept in to ensure that the JSON decode error message is printed alongside the filename - let json: Vec<AGCJFunctionCallGraphs> = serde_json::from_str(&data)?; + let json: Vec<AGCJFunctionCallGraph> = serde_json::from_str(&data)?; self.function_call_graphs = Some(json); Ok(()) } + + // Global Call Graph Related Functions + pub fn generate_global_call_graphs(&mut self, metadata_type: Option<String>) { + let call_graph = self.build_global_call_graph(); + debug!("Num Nodes (Default): {}", call_graph.node_count()); + let cleaned_graph = self.post_process_graph(call_graph); + debug!("Num Nodes (Post-Clean): {}", cleaned_graph.node_count()); + self.save_global_call_graph_to_json(cleaned_graph, metadata_type) + } + + fn build_global_call_graph(&mut self) -> Graph<String, u32> { + if self.function_call_graphs.is_none() { + let ret = self.load_and_deserialize(); + if ret.is_err() { + error!("Unable to load target data file - No functions to process.") + } + } + + let mut graph = Graph::<String, u32>::new(); + + for function in self.function_call_graphs.as_ref().unwrap().iter() { + let function_index_find = graph.node_indices().find(|i| graph[*i] == function.name); + + let function_index = if let Some(index) = function_index_find { + index + } else { + graph.add_node(function.name.clone()) + }; + + debug!( + "Function Index Find: {:?} Function Index Used: {:?}", + function_index_find, function_index + ); + + if function.imports.is_some() { + for import in function.imports.as_ref().unwrap().iter() { + if !self.include_unk && import.starts_with("unk.") { + debug!("Skipping {}", import); + continue; + } else { + let import_index_find = graph.node_indices().find(|i| &graph[*i] == import); + let import_index = if let Some(index) = import_index_find { + index + } else { + graph.add_node(import.clone()) + }; + + graph.update_edge(function_index, import_index, 0); + } + } + } + } + graph + } + + fn post_process_graph(&self, mut graph: Graph<String, u32>) -> Graph<String, u32> { + // Tidy up the generated call graph to account for when + // calling relationships may not have been recovered and + // we have orphan nodes
+ for node_idx in graph.node_indices() { + if graph + .neighbors_directed(node_idx, Outgoing) + .collect_vec() + .len() + + graph + .neighbors_directed(node_idx, Incoming) + .collect_vec() + .len() + == 0 + { + graph.remove_node(node_idx); + } + } + graph + } + + fn add_node_features_to_global_call_graph( + &self, + graph: Graph<String, u32>, + metadata_type: Option<String>, + ) -> CallGraphTypes { + match metadata_type.unwrap().as_str() { + "finfo" => { + let networkx_graph = NetworkxDiGraph::<CallGraphFuncWithMetadata>::from(( + graph, + self.function_metadata.as_ref().unwrap().as_afij().unwrap(), + )); + CallGraphTypes::CGMeta(networkx_graph) + } + "tiknib" => { + let networkx_graph = NetworkxDiGraph::<CallGraphTikNibFeatures>::from(( + graph, + self.function_metadata.as_ref().unwrap().as_agfj().unwrap(), + )); + CallGraphTypes::TikNib(networkx_graph) + } + _ => unreachable!("Impossible :D"), + } + } + fn save_global_call_graph_to_json( + &self, + graph: Graph<String, u32>, + metadata_type: Option<String>, + ) { + let networkx_graph = if metadata_type.is_some() { + self.add_node_features_to_global_call_graph(graph, metadata_type) + } else { + CallGraphTypes::CGName(NetworkxDiGraph::from(graph)) + }; + + let mut full_output_path = get_save_file_path( + &self.filename, + &self.output_path, + Some("gcg".to_string()), + Some("_cg".to_string()), + ); + + full_output_path.set_extension("json"); + + debug!( + "Attempting to save global call graph to: {:?}", + full_output_path + ); + + serde_json::to_writer( + &File::create(full_output_path).expect("Failed to create writer"), + &networkx_graph, + ) + .expect("Unable to write JSON"); + } + + // Local Call Graph Helper Functions + fn process_function_level_cg( + &self, + graph_data_type: DataType, + with_features: &bool, + metadata_type: Option<String>, + ) { + for fcg in self.function_call_graphs.as_ref().unwrap() { + match graph_data_type { + DataType::Cg => { + fcg.to_petgraph( + self, + &self.output_path, + &self.filename, + with_features, + &self.include_unk, + metadata_type.clone(), + ); + } + DataType::OneHopCg => { + fcg.one_hop_to_petgraph( + self, + &self.output_path, + &self.filename, + with_features, + &self.include_unk, + metadata_type.clone(), + ); + } + DataType::CgWithCallers => { + fcg.to_petgraph_with_callers( + self, + &self.output_path, + &self.filename, + with_features, + &self.include_unk, + metadata_type.clone(), + ); + } + DataType::OneHopCgWithcallers => { + fcg.one_hop_to_petgraph_with_callers( + self, + &self.output_path, + &self.filename, + with_features, + &self.include_unk, + metadata_type.clone(), + ); + } + _ => unreachable!("Not possible hopefully! :O"), + } + } + }
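Worth noting on `build_global_call_graph` above: every name lookup is a linear `node_indices().find(..)`, so constructing the graph is quadratic in the number of functions. A hedged alternative sketch (not what this patch ships) that keeps a side index of name to `NodeIndex`, assuming the same `Graph<String, u32>` weights:

```rust
use petgraph::graph::{Graph, NodeIndex};
use std::collections::HashMap;

// Hypothetical helper: find-or-insert a node in O(1) amortized time by
// caching name -> NodeIndex instead of rescanning node_indices() per call.
fn get_or_add(
    graph: &mut Graph<String, u32>,
    index: &mut HashMap<String, NodeIndex>,
    name: &str,
) -> NodeIndex {
    *index
        .entry(name.to_string())
        .or_insert_with(|| graph.add_node(name.to_string()))
}
```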
:O"), + } + } + } + + pub fn process_based_on_graph_data_type( + &mut self, + graph_data_type: DataType, + with_features: &bool, + metadata_type: Option, + ) { + match graph_data_type { + DataType::GlobalCg => self.generate_global_call_graphs(metadata_type.clone()), + DataType::Cg + | DataType::OneHopCg + | DataType::OneHopCgWithcallers + | DataType::CgWithCallers => self.process_function_level_cg( + graph_data_type, + with_features, + metadata_type.clone(), + ), + _ => unreachable!("Unreachable!"), + } + } } #[derive(Serialize, Deserialize, Debug)] pub struct AFIJFile { - pub filename: String, + pub filename: PathBuf, pub function_info: Option>, - pub output_path: String, + pub output_path: PathBuf, } impl AFIJFile { @@ -415,19 +634,30 @@ impl AFIJFile { Ok(()) } - pub fn subset(&mut self) -> FunctionMetadataTypes { - let mut func_info_subsets: Vec = Vec::new(); - debug!("Starting to subset functions"); - for function in self.function_info.as_ref().unwrap().iter() { - let subset = AFIJFeatureSubset::from(function); - func_info_subsets.push(subset) + pub fn subset(&mut self, extended: bool) -> FunctionMetadataTypes { + if extended { + let mut func_info_subsets_extended: Vec = Vec::new(); + debug!("Starting to subset functions"); + for function in self.function_info.as_ref().unwrap().iter() { + let subset = AFIJFeatureSubsetExtended::from(function); + func_info_subsets_extended.push(subset) + } + FunctionMetadataTypes::AFIJExtended(func_info_subsets_extended) + } else { + let mut func_info_subsets: Vec = Vec::new(); + debug!("Starting to subset functions"); + for function in self.function_info.as_ref().unwrap().iter() { + let subset = AFIJFeatureSubset::from(function); + func_info_subsets.push(subset) + } + FunctionMetadataTypes::AFIJ(func_info_subsets) } - FunctionMetadataTypes::AFIJ(func_info_subsets) } - pub fn subset_and_save(&mut self) { - let func_info_subsets = self.subset(); - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); - let filename = format!("{}-finfo-subset.json", fname_string); + pub fn subset_and_save(&mut self, extended: bool) { + let func_info_subsets = self.subset(extended); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); + let filename = format!("{}-finfo-subset.json", fname_string.to_string_lossy()); serde_json::to_writer( &File::create(filename).expect("Failed to create writer"), &func_info_subsets, @@ -438,9 +668,9 @@ impl AFIJFile { #[derive(Serialize, Deserialize, Debug)] pub struct TikNibFuncMetaFile { - pub filename: String, + pub filename: PathBuf, pub function_info: Option>, - pub output_path: String, + pub output_path: PathBuf, } impl TikNibFuncMetaFile { @@ -459,3 +689,77 @@ impl TikNibFuncMetaFile { FunctionMetadataTypes::AGFJ(self.function_info.clone().unwrap()) } } + +#[cfg(test)] +mod tests { + use crate::files::AGCJFile; + use std::collections::HashSet; + use std::path::PathBuf; + + fn return_test_file_oject(file_path: &str) -> AGCJFile { + let mut call_graph_file = AGCJFile { + filename: PathBuf::from(file_path), + function_call_graphs: None, + output_path: PathBuf::new(), + function_metadata: None, + include_unk: false, + }; + + call_graph_file + .load_and_deserialize() + .expect("Failed to load data"); + call_graph_file + } + + #[test] + fn test_global_call_graph_generation() { + let mut call_graph_file = return_test_file_oject("test-files/ls_cg.json"); + + let global_call_graph = call_graph_file.build_global_call_graph(); + + 
+ +#[cfg(test)] +mod tests { + use crate::files::AGCJFile; + use std::collections::HashSet; + use std::path::PathBuf; + + fn return_test_file_object(file_path: &str) -> AGCJFile { + let mut call_graph_file = AGCJFile { + filename: PathBuf::from(file_path), + function_call_graphs: None, + output_path: PathBuf::new(), + function_metadata: None, + include_unk: false, + }; + + call_graph_file + .load_and_deserialize() + .expect("Failed to load data"); + call_graph_file + } + + #[test] + fn test_global_call_graph_generation() { + let mut call_graph_file = return_test_file_object("test-files/ls_cg.json"); + + let global_call_graph = call_graph_file.build_global_call_graph(); + + assert_eq!(global_call_graph.node_count(), 111); + + let mut node_names = Vec::new(); + + for node in global_call_graph.raw_nodes().iter() { + node_names.push(node.weight.clone()) + } + + let unique_node_names = node_names.iter().collect::<HashSet<_>>(); + + assert_eq!(node_names.len(), unique_node_names.len()); + } + + #[test] + fn test_global_graph_with_redundant_nodes() { + let mut call_graph_file = return_test_file_object("data-examples/raw/test_bin_cg.json"); + + let global_call_graph = call_graph_file.build_global_call_graph(); + + assert_eq!(global_call_graph.node_count(), 9); + + let mut node_names = Vec::new(); + + for node in global_call_graph.raw_nodes().iter() { + node_names.push(node.weight.clone()) + } + + let unique_node_names = node_names.iter().collect::<HashSet<_>>(); + + assert_eq!(node_names.len(), unique_node_names.len()); + + let post_processed_call_graph = call_graph_file.post_process_graph(global_call_graph); + + assert_eq!(post_processed_call_graph.node_count(), 8); + + let mut node_names = Vec::new(); + + for node in post_processed_call_graph.raw_nodes().iter() { + node_names.push(node.weight.clone()) + } + + let unique_node_names = node_names.iter().collect::<HashSet<_>>(); + + assert_eq!(node_names.len(), unique_node_names.len()); + } +}
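One caveat in `post_process_graph` above: petgraph's `remove_node` swap-removes, so node indices yielded earlier by `node_indices()` can be invalidated mid-loop and an orphan may be missed. A hedged alternative sketch (not what this patch does) using `retain_nodes`, which performs the index bookkeeping internally; it assumes the same `Graph<String, u32>` types as `build_global_call_graph`:

```rust
use petgraph::graph::Graph;

// Keep only nodes that still have at least one in- or out-edge.
// retain_nodes() compacts the graph safely while visiting every node.
fn remove_orphan_nodes(graph: &mut Graph<String, u32>) {
    graph.retain_nodes(|g, idx| g.neighbors_undirected(idx).next().is_some());
}
```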
diff --git a/src/main.rs b/src/main.rs index 35a4cc8..2cd86cc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,6 +22,7 @@ pub mod agfj; pub mod bb; #[cfg(feature = "goblin")] pub mod binnfo; +mod combos; pub mod consts; pub mod dedup; pub mod errors; @@ -57,13 +58,14 @@ use utils::get_json_paths_from_dir; #[global_allocator] static GLOBAL: MiMalloc = MiMalloc; -#[derive(PartialEq)] -enum DataType { +#[derive(PartialEq, Copy, Clone)] +pub enum DataType { Cfg, Cg, OneHopCg, CgWithCallers, OneHopCgWithcallers, + GlobalCg, Invalid, } @@ -75,6 +77,7 @@ impl fmt::Display for DataType { DataType::CgWithCallers => write!(f, "Call Graph with Callers"), DataType::OneHopCg => write!(f, "One Hop Call Graph"), DataType::OneHopCgWithcallers => write!(f, "One Hop Call Graph with Callers"), + DataType::GlobalCg => write!(f, "Global Call Graph"), DataType::Invalid => write!(f, "Invalid"), } } @@ -93,16 +96,16 @@ enum GenerateSubCommands { Graphs { /// The path to a JSON file extracted using the command #[arg(short, long, value_name = "FILENAME")] - path: String, + path: PathBuf, /// The target data type - #[arg(short, long, value_name = "DATA_TYPE", value_parser = clap::builder::PossibleValuesParser::new(["cfg", "cg", "onehopcg", "cgcallers", "onehopcgcallers"]) + #[arg(short, long, value_name = "DATA_TYPE", value_parser = clap::builder::PossibleValuesParser::new(["cfg", "cg", "onehopcg", "cgcallers", "onehopcgcallers", "globalcg"]) .map(|s| s.parse::<String>().unwrap()),)] data_type: String, /// The output path for the processed Networkx graphs (1 per function) #[arg(short, long, value_name = "OUTPUT")] - output_path: String, + output_path: PathBuf, /// The type of features to generate per basic block (node) #[arg(short, long, value_name = "FEATURE_TYPE", value_parser = clap::builder::PossibleValuesParser::new(["gemini", "discovre", "dgis"]) @@ -143,7 +146,7 @@ enum GenerateSubCommands { /// Filepath to the AFIJ function metadata (For call graphs) #[arg(long)] - metadata_path: Option<String>, + metadata_path: Option<PathBuf>, /// Include unknown functions (For call graphs) #[arg(long, default_value = "false")] @@ -158,7 +161,7 @@ enum GenerateSubCommands { Nlp { /// The path to a JSON file extracted using the command #[arg(short, long, value_name = "FILENAME")] - path: String, + path: PathBuf, /// The type of data to be generated #[arg(short, long, value_name = "DATA_TYPE", value_parser = clap::builder::PossibleValuesParser::new(["esil", "disasm"]) @@ -171,7 +174,7 @@ enum GenerateSubCommands { /// The output path for the processed data #[arg(short, long, value_name = "OUTPUT_PATH")] - data_out_path: String, + data_out_path: PathBuf, /// The format of the output data #[arg(short, long, value_name = "FORMAT", value_parser = clap::builder::PossibleValuesParser::new(["single", "funcstring"]) @@ -194,14 +197,17 @@ enum GenerateSubCommands { Metadata { /// The path to an afji JSON file extracted using the command #[arg(short, long, value_name = "INPUT_PATH")] - input_path: String, + input_path: PathBuf, /// The path for the generated output #[arg(short, long, value_name = "OUTPUT_PATH")] - output_path: String, + output_path: PathBuf, /// Data Source Type #[arg(short, long, value_parser = clap::builder::PossibleValuesParser::new(["finfo", "agfj"]) .map(|s| s.parse::<String>().unwrap()))] data_source_type: String, + /// Toggle for extended version of finfo + #[arg(short, long)] + extended: bool, }, /// Generate tokenisers from extracted data Tokeniser { @@ -222,6 +228,18 @@ enum GenerateSubCommands { #[arg(short, long, value_name = "BPE or Byte-BPE", default_value = "BPE")] tokeniser_type: String, }, + /// Generate combinations of extracted data - Primarily metadata objects + Combos { + #[arg(short, long, value_name = "INPUT_PATH")] + input_path: PathBuf, + /// The path for the generated output + #[arg(short, long, value_name = "OUTPUT_PATH")] + output_path: PathBuf, + /// Combo Type + #[arg(short, long, value_parser = clap::builder::PossibleValuesParser::new(["finfo+tiknib", "finfoe+tiknib"]) + .map(|s| s.parse::<String>().unwrap()))] + combo_type: String, + }, } #[derive(Subcommand)] @@ -231,7 +249,7 @@ enum Commands { Info { /// The path to the target binary #[arg(short, long, value_name = "FILENAME")] - path: Option<String>, + path: Option<PathBuf>, }, /// Generate processed data from extracted raw data Generate { @@ -242,11 +260,11 @@ enum Commands { Extract { /// The path to the dir or binary to be processed #[arg(short, long, value_name = "DIR")] - fpath: String, + fpath: PathBuf, /// The path for the output directory #[arg(short, long, value_name = "DIR")] - output_dir: String, + output_dir: PathBuf, /// The extraction mode #[arg(short, long, value_name = "EXTRACT_MODE", value_parser = clap::builder::PossibleValuesParser::new(["finfo", "reg", "cfg", "xrefs","cg"]) @@ -259,6 +277,12 @@ enum Commands { #[arg(long, default_value = "false")] debug: bool, + + #[arg(long, default_value = "false")] + extended_analysis: bool, + + #[arg(long, default_value = "true")] + use_curl_pdb: bool, }, /// Generate single embeddings on the fly /// @@ -295,11 +319,11 @@ enum DedupSubCommands { Cgs { /// The filename to dedup #[arg(short, long, value_name = "FILENAME")] - filename: String, + filename: PathBuf, /// Output path to save dedup corpus #[arg(short, long, value_name = "OUTPUT_PATH")] - output_path: String, + output_path: PathBuf, /// Number of threads to use with Rayon #[arg(short, long, value_name = "NUM_THREADS", default_value = "2")] @@ -314,16 +338,20 @@ enum DedupSubCommands { #[arg(long,value_parser = clap::builder::PossibleValuesParser::new(["cgmeta", "cgname", "tiknib"]) .map(|s| s.parse::<String>().unwrap()), required = true)] node_feature_type: String, + + /// Toggle to remove in place (i.e. delete duplicates) + #[arg(long)] + inplace: bool, }, /// De-dup generate ESIL strings Esil { /// The filename to dedup #[arg(short, long,
value_name = "FILENAME")] - filename: String, + filename: PathBuf, /// Output path to save dedup corpus #[arg(short, long, value_name = "OUTPUT_PATH")] - output_path: String, + output_path: PathBuf, /// Toggle to print statistics of number of functions before and after dedup #[arg(long, default_value = "false")] @@ -385,6 +413,7 @@ fn main() { "onehopcg" => DataType::OneHopCg, "cgcallers" => DataType::CgWithCallers, "onehopcgcallers" => DataType::OneHopCgWithcallers, + "globalcg" => DataType::GlobalCg, _ => DataType::Invalid, }; @@ -397,8 +426,8 @@ fn main() { warn!("The 'with_features' toggle is set but is not support for CFG generation. Will ignore.") }; - if !Path::new(path).exists() { - error!("{} does not exist!", path); + if !path.exists() { + error!("{:?} does not exist!", path); exit(1) } info!("Chosen Graph Type: {}", graph_data_type); @@ -441,7 +470,7 @@ fn main() { { if file.path().to_string_lossy().ends_with(".json") { agfj_graph_statistical_features( - file.path().to_str().unwrap(), + file.path(), &min_blocks.unwrap(), output_path, feature_vec_type, @@ -455,7 +484,7 @@ fn main() { #[cfg(feature = "inference")] if feature_vec_type == FeatureType::ModelEmbedded { if tokeniser_fp.is_none() || model_fp.is_none() { - println!("Both Tokeniser and Model filespaths are needed"); + println!("Both Tokenizer and Model file paths are needed"); exit(100) } else { agfj_graph_embedded_feats( @@ -474,290 +503,186 @@ fn main() { } else { error!("--feature-type/-f is required for creating CFG's") } - } else { - // If its only one file - if Path::new(path).is_file() { - let mut file = if *with_features { - if metadata_path.is_none() { - error!("with features active - require --metadata-path argument"); - exit(1) - }; + } else if Path::new(path).is_file() { + let mut file = match with_features { + true => { let mut metadata = AFIJFile { - filename: metadata_path.clone().unwrap(), + filename: metadata_path.as_ref().unwrap().to_path_buf(), function_info: None, - output_path: "".to_string(), + output_path: PathBuf::new(), }; + debug!("AFIJ Object: {:?}", metadata); metadata .load_and_deserialize() .expect("Unable to load file"); - let metadata_subset = metadata.subset(); + let metadata_subset = metadata.subset(false); AGCJFile { - filename: path.to_owned(), + filename: path.clone(), function_call_graphs: None, - output_path: output_path.to_owned(), + output_path: output_path.clone(), function_metadata: Some(metadata_subset), include_unk: *include_unk, } - } else { - AGCJFile { - filename: path.to_owned(), - function_call_graphs: None, - output_path: output_path.to_owned(), - function_metadata: None, - include_unk: *include_unk, - } - }; - file.load_and_deserialize() - .expect("Unable to load and desearilize JSON"); - if graph_data_type == DataType::Cg { - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.to_petgraph( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone(), - ); - } - } else if graph_data_type == DataType::OneHopCg { - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.one_hop_to_petgraph( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone(), - ); - } - } else if graph_data_type == DataType::CgWithCallers { - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone(), - ); - } - } else if graph_data_type == 
DataType::OneHopCgWithcallers { - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.one_hop_to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, + } + false => AGCJFile { + filename: path.clone(), + function_call_graphs: None, + output_path: output_path.clone(), + function_metadata: None, + include_unk: *include_unk, + }, + }; + + file.load_and_deserialize() + .expect("Unable to load and deserialize JSON"); + file.process_based_on_graph_data_type( + graph_data_type, + with_features, + metadata_type.clone(), + ); + } else { + debug!("Multiple files found"); + + if metadata_path.is_none() & with_features { + error!("with features active - require --metadata-path argument"); + exit(1) + }; + + let mut file_paths_vec = get_json_paths_from_dir(path, Some("_cg".to_string())); + info!( + "{} files found. Beginning Processing.", + file_paths_vec.len() + ); + // if without metadata + if !with_features & metadata_type.is_none() { + debug!("Creating call graphs without any node features"); + file_paths_vec.par_iter().progress().for_each(|path| { + let suffix = graph_type.to_owned().to_string(); + let full_output_path = get_save_file_path( + &PathBuf::from(path), + output_path, + Some(suffix), + None, + ); + if !full_output_path.is_dir() { + let mut file = AGCJFile { + filename: path.to_owned().parse().unwrap(), + function_call_graphs: None, + output_path: output_path.to_owned(), + function_metadata: None, + include_unk: *include_unk, + }; + debug!("Processing {:?}", file.filename); + file.load_and_deserialize() + .expect("Unable to load and deserialize JSON"); + file.process_based_on_graph_data_type( + graph_data_type, with_features, - &file.include_unk, metadata_type.clone(), ); + } else { + info!( + "Skipping {} as already exists", + full_output_path.to_string_lossy() + ) } - } + }) } else { - debug!("Multiple files found"); - - if metadata_path.is_none() & with_features { + debug!("Creating call graphs with node features"); + debug!("Getting metadata file paths"); + // its more than one file + if metadata_path.is_none() { error!("with features active - require --metadata-path argument"); exit(1) }; - let mut file_paths_vec = - get_json_paths_from_dir(path, Some("_cg".to_string())); - info!( - "{} files found. 
Beginning Processing.", - file_paths_vec.len() + if with_features & metadata_type.is_none() { + error!("with features requires metadata_type to be set") + } + let mut metadata_paths_vec = get_json_paths_from_dir( + metadata_path.as_ref().unwrap(), + Some(metadata_type.as_ref().unwrap().to_string()), ); - // if without metadata - if !with_features & metadata_type.is_none() { - debug!("Creating call graphs without any node features"); - file_paths_vec.par_iter().progress().for_each(|path| { - let suffix = graph_type.to_owned().to_string(); - let full_output_path = PathBuf::from(get_save_file_path( - path, - output_path, - Some(suffix), - )); - if !full_output_path.is_dir() { - let mut file = AGCJFile { - filename: path.to_owned(), - function_call_graphs: None, - output_path: output_path.to_owned(), - function_metadata: None, - include_unk: *include_unk, - }; - debug!("Proceissing {}", file.filename); - file.load_and_deserialize() - .expect("Unable to load and desearilize JSON"); - - for fcg in file.function_call_graphs.as_ref().unwrap() { - match graph_data_type { - DataType::Cg => { - fcg.to_petgraph( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - None, - ); - } - DataType::OneHopCg => { - fcg.one_hop_to_petgraph( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - None, - ); - } - DataType::CgWithCallers => { - fcg.to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - None, - ); - } - DataType::OneHopCgWithcallers => { - fcg.one_hop_to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - None, - ); - } - _ => unreachable!("Not possible hopefully! :O"), - } - } - } else { - info!( - "Skipping {} as already exists", - full_output_path.to_string_lossy() - ) - } - }) - } else { - debug!("Creating call graphs with node features"); - debug!("Getting metadata file paths"); - // its more than one file - if metadata_path.is_none() { - error!("with features active - require --metadata-path argument"); - exit(1) - }; - if with_features & metadata_type.is_none() { - error!("with features requires metadata_type to be set") - } - let mut metadata_paths_vec = get_json_paths_from_dir( - metadata_path.as_ref().unwrap(), - Some(metadata_type.as_ref().unwrap().to_string()), - ); + file_paths_vec.sort(); + metadata_paths_vec.sort(); - file_paths_vec.sort(); - metadata_paths_vec.sort(); + assert_eq!(file_paths_vec.len(), metadata_paths_vec.len()); + let combined_cgs_metadata = file_paths_vec + .into_iter() + .zip(metadata_paths_vec) + .collect::>(); - assert_eq!(file_paths_vec.len(), metadata_paths_vec.len()); - let combined_cgs_metadata = file_paths_vec - .into_iter() - .zip(metadata_paths_vec) - .collect::>(); - - combined_cgs_metadata.par_iter().progress().for_each(|tup| { + combined_cgs_metadata.par_iter().progress().for_each( + |(filepath, metapath)| { let suffix = format!("{}-meta", graph_type.to_owned()); - let full_output_path = - PathBuf::from(get_save_file_path(&tup.0, output_path, Some(suffix))); + let full_output_path = get_save_file_path( + &PathBuf::from(filepath), + output_path, + Some(suffix), + None, + ); if !full_output_path.is_dir() { let mut file = { let metadata: Option; if metadata_type.clone().unwrap() == *"finfo" { let mut metadata_file = AFIJFile { - filename: tup.1.clone(), + filename: PathBuf::from(metapath), function_info: None, - output_path: "".to_string(), + output_path: PathBuf::new(), 
}; - debug!("Attempting to load metadata file: {}", tup.1); + debug!( + "Attempting to load metadata file: {}", + metapath + ); metadata_file .load_and_deserialize() .expect("Unable to load associated metadata file"); - metadata = Some(metadata_file.subset()); + metadata = Some(metadata_file.subset(false)); } else if metadata_type.clone().unwrap() == *"tiknib" { let mut metadata_file = TikNibFuncMetaFile { - filename: tup.1.clone(), + filename: PathBuf::from(metapath), function_info: None, - output_path: "".to_string(), + output_path: PathBuf::new(), }; - metadata_file.load_and_deserialize().expect("Unable to load associated metadata file"); + metadata_file + .load_and_deserialize() + .expect("Unable to load associated metadata file"); metadata = Some(metadata_file.subset()); - } else { + } else { metadata = None } AGCJFile { - filename: tup.0.to_owned(), + filename: PathBuf::from(filepath), function_call_graphs: None, output_path: output_path.to_owned(), function_metadata: metadata, include_unk: *include_unk, } }; - debug!("Attempting to load {}", file.filename); + debug!("Attempting to load {:?}", file.filename); file.load_and_deserialize() - .expect("Unable to load and desearilize JSON"); - - if graph_data_type == DataType::Cg { - debug!("Generating call graphs using loaded cgs + metadata"); - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.to_petgraph( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone() - ); - } - } else if graph_data_type == DataType::OneHopCg { - debug!("Generating one hop call graphs using loaded cgs + metadata"); - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.one_hop_to_petgraph(&file, &file.output_path, &file.filename, with_features, &file.include_unk, metadata_type.clone()); - } - } else if graph_data_type == DataType::CgWithCallers { - debug!("Generating call graphs with callers using loaded cgs + metadata"); - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone() - ); - } - } else if graph_data_type == DataType::OneHopCgWithcallers { - debug!("Generating one hop call graphs with callers using loaded cgs + metadata"); - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.one_hop_to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone() - ); - } + .expect("Unable to load and deserialize JSON"); + + file.process_based_on_graph_data_type( + graph_data_type, + with_features, + metadata_type.clone(), + ); + debug!( + "Finished generating cgs + metadata for {:?}", + file.filename + ); + } else { + info!( + "Skipping {} as already exists", + full_output_path.to_string_lossy() + ) } - debug!("Finished generating cgs + metadata for {}", file.filename); - } else { - info!("Skipping {} as already exists", full_output_path.to_string_lossy()) - }}); - } + }, + ); } } } @@ -765,6 +690,7 @@ fn main() { input_path, output_path, data_source_type, + extended, } => { if data_source_type == "finfo" { let mut file = AFIJFile { @@ -776,22 +702,56 @@ fn main() { file.load_and_deserialize() .expect("Unable to load and desearilize JSON"); info!("Successfully loaded JSON"); - file.subset_and_save(); + file.subset_and_save(*extended); info!("Generation complete"); } else if data_source_type == "agfj" { warn!("This currently only supports making TikNib features for 
single files"); - let mut file = AGFJFile { - functions: None, - filename: input_path.to_owned(), - output_path: output_path.to_string(), - min_blocks: 1, // Dummy - feature_type: None, - architecture: None, - reg_norm: false, // Dummy - }; - file.load_and_deserialize().expect("Unable to load data"); - file.tiknib_func_level_feature_gen() + if input_path.is_file() { + let mut file = AGFJFile { + functions: None, + filename: input_path.to_owned(), + output_path: output_path.to_owned(), + min_blocks: 1, // Dummy + feature_type: None, + architecture: None, + reg_norm: false, // Dummy + }; + + file.load_and_deserialize().expect("Unable to load data"); + file.tiknib_func_level_feature_gen() + } else { + let file_paths_vec = + get_json_paths_from_dir(input_path, Some("_cfg".to_string())); + + file_paths_vec.par_iter().for_each(|filepath| { + let mut file = AGFJFile { + functions: None, + filename: filepath.to_owned().parse().unwrap(), + output_path: output_path.to_owned(), + min_blocks: 1, // Dummy + feature_type: None, + architecture: None, + reg_norm: false, // Dummy + }; + + file.load_and_deserialize().expect("Unable to load data"); + file.tiknib_func_level_feature_gen() + }); + } + } + } + GenerateSubCommands::Combos { + input_path, + output_path: _, + combo_type, + } => { + warn!("This feature is experimental and should be used with caution!"); + if combo_type == "finfo+tiknib" { + let _finfo_paths = + get_json_paths_from_dir(input_path, Some("_finfo".to_string())); + let _tiknib_paths = + get_json_paths_from_dir(input_path, Some("cfg-tiknib".to_string())); } } GenerateSubCommands::Nlp { @@ -831,7 +791,7 @@ fn main() { let file = AGFJFile { functions: None, filename: path.to_owned(), - output_path: data_out_path.to_string(), + output_path: data_out_path.to_owned(), min_blocks: *min_blocks, feature_type: None, architecture: None, @@ -849,8 +809,8 @@ fn main() { for file in file_paths_vec.iter().progress() { let file = AGFJFile { functions: None, - filename: file.to_string(), - output_path: data_out_path.to_string(), + filename: PathBuf::from(file), + output_path: data_out_path.to_owned(), min_blocks: *min_blocks, feature_type: None, architecture: None, @@ -892,14 +852,24 @@ fn main() { mode, num_threads, debug, + extended_analysis, + use_curl_pdb, } => { info!("Creating extraction job"); - let job = ExtractionJob::new(fpath, output_dir, mode).unwrap(); + let job = ExtractionJob::new( + fpath, + output_dir, + mode, + debug, + extended_analysis, + use_curl_pdb, + ) + .unwrap(); if job.input_path_type == PathType::Dir { info!("Directory found - will parallel process"); - info!("Creating threadpool with {} threads ", num_threads); + info!("Creating thread pool with {} threads ", num_threads); rayon::ThreadPoolBuilder::new() .num_threads(*num_threads) .build_global() @@ -912,7 +882,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_func_cfgs(debug)); + .for_each(|path| path.extract_func_cfgs()); } else if job.job_type == ExtractionJobType::RegisterBehaviour { info!("Extraction Job Type: Register Behaviour"); info!("Starting Parallel generation."); @@ -920,7 +890,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_register_behaviour(debug)); + .for_each(|path| path.extract_register_behaviour()); } else if job.job_type == ExtractionJobType::FunctionXrefs { info!("Extraction Job Type: Function Xrefs"); info!("Starting Parallel generation."); @@ -928,7 +898,7 @@ fn main() { job.files_to_be_processed 
.par_iter() .progress() - .for_each(|path| path.extract_function_xrefs(debug)); + .for_each(|path| path.extract_function_xrefs()); } else if job.job_type == ExtractionJobType::CallGraphs { info!("Extraction Job Type: Call Graphs"); info!("Starting Parallel generation."); @@ -936,7 +906,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_function_call_graphs(debug)); + .for_each(|path| path.extract_function_call_graphs()); } else if job.job_type == ExtractionJobType::FuncInfo { info!("Extraction Job Type: Function Info"); info!("Starting Parallel generation."); @@ -944,27 +914,27 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_function_info(debug)); + .for_each(|path| path.extract_function_info()); } } else if job.input_path_type == PathType::File { info!("Single file found"); if job.job_type == ExtractionJobType::CFG { info!("Extraction Job Type: CFG"); - job.files_to_be_processed[0].extract_func_cfgs(debug); + job.files_to_be_processed[0].extract_func_cfgs(); } else if job.job_type == ExtractionJobType::RegisterBehaviour { info!("Extraction Job Type: Register Behaviour"); - job.files_to_be_processed[0].extract_register_behaviour(debug) + job.files_to_be_processed[0].extract_register_behaviour() } else if job.job_type == ExtractionJobType::FunctionXrefs { info!("Extraction Job type: Function Xrefs"); - job.files_to_be_processed[0].extract_function_xrefs(debug) + job.files_to_be_processed[0].extract_function_xrefs() } else if job.job_type == ExtractionJobType::CallGraphs { info!("Extraction Job type: Function Call Graphs"); - job.files_to_be_processed[0].extract_function_call_graphs(debug) + job.files_to_be_processed[0].extract_function_call_graphs() } else if job.job_type == ExtractionJobType::FuncInfo { info!("Extraction Job type: Function Info"); - job.files_to_be_processed[0].extract_function_info(debug) + job.files_to_be_processed[0].extract_function_info() } - info!("Extraction complete for {}", fpath) + info!("Extraction complete for {:?}", fpath) } } @@ -989,22 +959,26 @@ fn main() { num_threads, filepath_format, node_feature_type, + inplace, } => { rayon::ThreadPoolBuilder::new() .num_threads(*num_threads) .build_global() .unwrap(); - warn!("This only supports the Cisco Talos Binary Sim Dataset naming convention"); if Path::new(filename).exists() { let node_feature_type = CallGraphNodeFeatureType::new(node_feature_type); info!("Starting duplication process for One Hop Call Graphs"); let corpus = CGCorpus::new(filename, output_path, filepath_format, node_feature_type) .unwrap(); - corpus.process_corpus(); + if *inplace { + corpus.process_corpus_inplace(); + } else { + corpus.process_corpus(); + } } else { - error!("Filename provided does not exist! - {}", filename) + error!("Filename provided does not exist! 
- {:?}", filename) } } DedupSubCommands::Esil { diff --git a/src/networkx.rs b/src/networkx.rs index be59509..7e7cc93 100644 --- a/src/networkx.rs +++ b/src/networkx.rs @@ -60,19 +60,12 @@ impl CallGraphNodeFeatureType { #[serde(rename_all = "camelCase")] pub struct GeminiNode { pub id: i64, - #[serde(rename = "num calls")] pub num_calls: f64, - #[serde(rename = "num transfer")] pub num_transfer: f64, - #[serde(rename = "num arith")] pub num_arith: f64, - #[serde(rename = "num ins")] pub num_ins: f64, - #[serde(rename = "numeric consts")] pub numeric_consts: f64, - #[serde(rename = "string consts")] pub string_consts: f64, - #[serde(rename = "num offspring")] pub num_offspring: f64, } @@ -95,21 +88,13 @@ impl From<(i64, &Vec)> for GeminiNode { #[serde(rename_all = "camelCase")] pub struct DGISNode { pub id: i64, - #[serde(rename = "num stack ops")] pub num_stack_ops: f64, - #[serde(rename = "num arith ops")] pub num_arith_ops: f64, - #[serde(rename = "num logic ops")] pub num_logic_ops: f64, - #[serde(rename = "num cmp ops")] pub num_cmp_ops: f64, - #[serde(rename = "num lib calls")] pub num_lib_calls: f64, - #[serde(rename = "num uncon jumps")] pub num_uncon_jumps: f64, - #[serde(rename = "num con jumps")] pub num_con_jumps: f64, - #[serde(rename = "num generic ins")] pub num_generic_ins: f64, } @@ -133,17 +118,11 @@ impl From<(i64, &Vec)> for DGISNode { #[serde(rename_all = "camelCase")] pub struct DiscovreNode { pub id: i64, - #[serde(rename = "num calls")] pub num_calls: f64, - #[serde(rename = "num transfer")] pub num_transfer: f64, - #[serde(rename = "num arith")] pub num_arith: f64, - #[serde(rename = "num ins")] pub num_ins: f64, - #[serde(rename = "numeric consts")] pub numeric_consts: f64, - #[serde(rename = "string consts")] pub string_consts: f64, } diff --git a/src/processors.rs b/src/processors.rs index 3c52366..6d10c09 100644 --- a/src/processors.rs +++ b/src/processors.rs @@ -6,6 +6,7 @@ use crate::files::AGFJFile; #[cfg(feature = "inference")] use crate::inference::InferenceJob; use serde::{Deserialize, Serialize}; +use std::path::{Path, PathBuf}; #[cfg(feature = "inference")] use std::process::exit; #[cfg(feature = "inference")] @@ -41,9 +42,9 @@ struct EdgePair { #[allow(clippy::too_many_arguments)] #[cfg(feature = "inference")] pub fn agfj_graph_embedded_feats( - path: &str, + path: &Path, min_blocks: &u16, - output_path: &str, + output_path: &Path, feature_type: FeatureType, tokeniser_fp: &Option, model_fp: &Option, @@ -53,7 +54,7 @@ pub fn agfj_graph_embedded_feats( let file = AGFJFile { functions: None, filename: path.to_owned(), - output_path: output_path.to_string(), + output_path: output_path.to_owned(), min_blocks: *min_blocks, feature_type: Some(feature_type), architecture: None, @@ -86,15 +87,15 @@ pub fn agfj_graph_embedded_feats( } pub fn agfj_graph_statistical_features( - path: &str, + path: &Path, min_blocks: &u16, - output_path: &str, + output_path: &PathBuf, feature_type: FeatureType, ) { let mut file = AGFJFile { functions: None, filename: path.to_owned(), - output_path: output_path.to_string(), + output_path: output_path.to_owned(), min_blocks: *min_blocks, feature_type: Some(feature_type), architecture: None, diff --git a/src/utils.rs b/src/utils.rs index 77fa795..495dcbc 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,5 +1,5 @@ use std::fs::create_dir_all; -use std::path::Path; +use std::path::PathBuf; use walkdir::WalkDir; /// Formats a save file path @@ -15,35 +15,49 @@ use walkdir::WalkDir; /// /// See agcj.rs for an example of this optional 
suffix being used pub fn get_save_file_path( - binary_path: &str, - output_path: &String, + binary_path: &PathBuf, + output_path: &PathBuf, optional_suffix: Option, -) -> String { + remove_suffix: Option, +) -> PathBuf { debug!( "Building Filepath - Binary Path: {:?} Output Path: {:?}", binary_path, output_path ); - let file_name = Path::new(binary_path) + let file_name = binary_path .file_stem() .unwrap() .to_string_lossy() .to_string(); + let file_name = if let Some(suffix) = remove_suffix { + file_name.replace(&suffix, "") + } else { + file_name + }; + + if optional_suffix.is_none() { let full_output_path = format!( "{}/{}", - output_path.strip_suffix('/').unwrap_or(output_path), + output_path + .to_string_lossy() + .strip_suffix('/') + .unwrap_or(output_path.as_os_str().to_str().unwrap()), file_name ); - full_output_path + PathBuf::from(full_output_path) } else { let full_output_path = format!( "{}/{}-{}", - output_path.strip_suffix('/').unwrap_or(output_path), + output_path + .to_string_lossy() + .strip_suffix('/') + .unwrap_or(output_path.as_os_str().to_str().unwrap()), file_name, optional_suffix.unwrap() ); - full_output_path + PathBuf::from(full_output_path) } } @@ -53,7 +67,7 @@ pub fn get_save_file_path( /// files present within identifying files ending in .json before /// returning a Vec where each string is an absolute path /// to a given JSON file -pub fn get_json_paths_from_dir(path: &String, identifier: Option) -> Vec { +pub fn get_json_paths_from_dir(path: &PathBuf, identifier: Option) -> Vec { let mut str_vec: Vec = Vec::new(); let pattern = if identifier.is_none() { ".json".to_string() @@ -72,8 +86,8 @@ pub fn get_json_paths_from_dir(path: &String, identifier: Option) -> Vec } /// Checks to see if a directory is prsent, if not creates -pub fn check_or_create_dir(full_output_path: &String) { - if !Path::new(full_output_path).is_dir() { +pub fn check_or_create_dir(full_output_path: &PathBuf) { + if !full_output_path.is_dir() { create_dir_all(full_output_path).expect("Unable to create directory!"); } } @@ -89,23 +103,36 @@ mod tests { // TESTS FOR SAVE PATH BUILDING #[test] fn test_get_save_file_path_1() { - let path: &str = "test_bin/hello.json"; - let output_path: String = String::from("processed_data/"); - let output_path = get_save_file_path(path, &output_path, Some("cg".to_string())); - assert_eq!(output_path, String::from("processed_data/hello-cg")) + let path: &PathBuf = &PathBuf::from("test_bin/hello.json"); + let output_path: &PathBuf = &PathBuf::from("processed_data/"); + let output_path = get_save_file_path(path, &output_path, Some("cg".to_string()), None); + assert_eq!(output_path, PathBuf::from("processed_data/hello-cg")) } #[test] fn test_get_save_file_path_2() { - let path: &str = "test_bin/extra_dir/hello.json"; - let output_path: String = String::from("with_more/processed_data/"); - let output = get_save_file_path(path, &output_path, None); - assert_eq!(output, String::from("with_more/processed_data/hello")) + let path: &PathBuf = &PathBuf::from("test_bin/extra_dir/hello.json"); + let output_path: &PathBuf = &PathBuf::from("with_more/processed_data/"); + let output = get_save_file_path(path, output_path, None, None); + assert_eq!(output, PathBuf::from("with_more/processed_data/hello")) } #[test] fn test_get_save_file_path_3() { - let path: &str = "hello.json"; - let output_path: String = String::from("processed_data"); - let output = get_save_file_path(path, &output_path, None); - assert_eq!(output, String::from("processed_data/hello")) + let path: &PathBuf 
= &PathBuf::from("hello.json"); + let output_path: &PathBuf = &PathBuf::from("processed_data"); + let output = get_save_file_path(path, &output_path, None, None); + assert_eq!(output, PathBuf::from("processed_data/hello")) + } + + #[test] + fn test_get_save_file_path_with_suffix_removal() { + let path: &PathBuf = &PathBuf::from("hello_cg.json"); + let output_path: &PathBuf = &PathBuf::from("processed_data"); + let output = get_save_file_path( + path, + &output_path, + Some("gcg".to_string()), + Some("_cg".to_string()), + ); + assert_eq!(output, PathBuf::from("processed_data/hello-gcg")) } }