From 967a9f425a4722c4eafd8a90ed23c1f6c5a1d3b2 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Mon, 29 Jan 2024 10:39:40 +0000 Subject: [PATCH 01/40] removing useless renames --- src/networkx.rs | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/src/networkx.rs b/src/networkx.rs index be59509..7e7cc93 100644 --- a/src/networkx.rs +++ b/src/networkx.rs @@ -60,19 +60,12 @@ impl CallGraphNodeFeatureType { #[serde(rename_all = "camelCase")] pub struct GeminiNode { pub id: i64, - #[serde(rename = "num calls")] pub num_calls: f64, - #[serde(rename = "num transfer")] pub num_transfer: f64, - #[serde(rename = "num arith")] pub num_arith: f64, - #[serde(rename = "num ins")] pub num_ins: f64, - #[serde(rename = "numeric consts")] pub numeric_consts: f64, - #[serde(rename = "string consts")] pub string_consts: f64, - #[serde(rename = "num offspring")] pub num_offspring: f64, } @@ -95,21 +88,13 @@ impl From<(i64, &Vec)> for GeminiNode { #[serde(rename_all = "camelCase")] pub struct DGISNode { pub id: i64, - #[serde(rename = "num stack ops")] pub num_stack_ops: f64, - #[serde(rename = "num arith ops")] pub num_arith_ops: f64, - #[serde(rename = "num logic ops")] pub num_logic_ops: f64, - #[serde(rename = "num cmp ops")] pub num_cmp_ops: f64, - #[serde(rename = "num lib calls")] pub num_lib_calls: f64, - #[serde(rename = "num uncon jumps")] pub num_uncon_jumps: f64, - #[serde(rename = "num con jumps")] pub num_con_jumps: f64, - #[serde(rename = "num generic ins")] pub num_generic_ins: f64, } @@ -133,17 +118,11 @@ impl From<(i64, &Vec)> for DGISNode { #[serde(rename_all = "camelCase")] pub struct DiscovreNode { pub id: i64, - #[serde(rename = "num calls")] pub num_calls: f64, - #[serde(rename = "num transfer")] pub num_transfer: f64, - #[serde(rename = "num arith")] pub num_arith: f64, - #[serde(rename = "num ins")] pub num_ins: f64, - #[serde(rename = "numeric consts")] pub numeric_consts: f64, - #[serde(rename = "string consts")] pub string_consts: f64, } From e03d24176a100ea843e0ba58392abec2bb681ab1 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Mon, 29 Jan 2024 10:41:30 +0000 Subject: [PATCH 02/40] suggested refactor to reduce un-needed clone --- src/agfj.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/agfj.rs b/src/agfj.rs index 83c52aa..5c8de4d 100644 --- a/src/agfj.rs +++ b/src/agfj.rs @@ -333,12 +333,12 @@ impl AGFJFunc { check_or_create_dir(&full_output_path); let file_name = path.split('/').last().unwrap(); let binary_name: Vec<_> = file_name.split(".j").collect(); - let mut function_name = self.name.clone(); - // This is a pretty dirty fix and may break things - if function_name.chars().count() > 100 { - function_name = self.name[..75].to_string(); - } + let function_name = if self.name.chars().count() > 100 { + &self.name[..75] + } else { + &self.name + }; let fname_string = format!( "{}/{}-{}.json", From 880052875fa395ea034433a5fbc27624681086d9 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Mon, 29 Jan 2024 10:41:44 +0000 Subject: [PATCH 03/40] adding test deps --- src/dedup.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/dedup.rs b/src/dedup.rs index 2ffbc1b..80d1b62 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -496,6 +496,14 @@ impl CGCorpus { } mod tests { + use crate::dedup::CGCorpus; + use crate::networkx::{ + CallGraphFuncWithMetadata, CallGraphNodeFeatureType, CallGraphTypes, NetworkxDiGraph, + }; + use std::fs; + use std::fs::read_to_string; + use std::path::Path; + use walkdir::WalkDir; // Test Dedup on typed CG's 
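    // The dedup tests that follow all lean on one core idea: a function is kept
    // only if the hash of its contents has not been seen before. A minimal sketch
    // of that idea using only std (`items` here is a stand-in for the loaded
    // call-graph data, not a field of this module):
    //
    //   use std::collections::hash_map::DefaultHasher;
    //   use std::collections::HashSet;
    //   use std::hash::{Hash, Hasher};
    //
    //   fn content_hash<T: Hash>(value: &T) -> u64 {
    //       let mut hasher = DefaultHasher::new();
    //       value.hash(&mut hasher);
    //       hasher.finish()
    //   }
    //
    //   // Keep the first occurrence of each unique hash, drop the rest.
    //   let mut seen = HashSet::new();
    //   items.retain(|item| seen.insert(content_hash(item)));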
#[test] From a990b3d1c396f6101db4b34b401a371069c8b896 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Mon, 29 Jan 2024 13:17:11 +0000 Subject: [PATCH 04/40] refactoring all paths from string to PathBuf --- src/agcj.rs | 53 +++++----- src/agfj.rs | 41 ++++---- src/binnfo.rs | 7 +- src/dedup.rs | 251 +++++++++++++++++++++++----------------------- src/extract.rs | 82 ++++++++------- src/files.rs | 52 +++++----- src/main.rs | 89 ++++++++-------- src/processors.rs | 13 +-- src/utils.rs | 50 +++++---- 9 files changed, 331 insertions(+), 307 deletions(-) diff --git a/src/agcj.rs b/src/agcj.rs index 0c24c8e..ddab388 100644 --- a/src/agcj.rs +++ b/src/agcj.rs @@ -7,6 +7,7 @@ use itertools::Itertools; use petgraph::prelude::Graph; use serde::{Deserialize, Serialize}; use std::fs::File; +use std::path::PathBuf; #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] @@ -27,8 +28,8 @@ pub struct AGCJParsedObjects { impl AGCJFunctionCallGraphs { fn graph_to_json_func_node( &self, - binary_name: &str, - output_path: &String, + binary_name: &PathBuf, + output_path: &PathBuf, networkx_graph: NetworkxDiGraph, type_suffix: &str, ) { @@ -44,7 +45,7 @@ impl AGCJFunctionCallGraphs { } let filename = format!( - "{}/{}-{}.json", + "{:?}/{}-{}.json", full_output_path, function_name, type_suffix ); @@ -57,8 +58,8 @@ impl AGCJFunctionCallGraphs { fn graph_to_json_func_metadata_tiknib( &self, - binary_name: &str, - output_path: &String, + binary_name: &PathBuf, + output_path: &PathBuf, networkx_graph: NetworkxDiGraph, type_suffix: &str, ) { @@ -74,7 +75,7 @@ impl AGCJFunctionCallGraphs { } let filename = format!( - "{}/{}-{}.json", + "{:?}/{}-{}.json", full_output_path, function_name, type_suffix ); @@ -87,8 +88,8 @@ impl AGCJFunctionCallGraphs { fn graph_to_json_func_metadata_finfo( &self, - binary_name: &str, - output_path: &String, + binary_name: &PathBuf, + output_path: &PathBuf, networkx_graph: NetworkxDiGraph, type_suffix: &str, ) { @@ -104,7 +105,7 @@ impl AGCJFunctionCallGraphs { } let filename = format!( - "{}/{}-{}.json", + "{:?}/{}-{}.json", full_output_path, function_name, type_suffix ); @@ -226,8 +227,8 @@ impl AGCJFunctionCallGraphs { pub fn to_petgraph( &self, global_cg: &AGCJFile, - output_path: &String, - binary_name: &str, + output_path: &PathBuf, + binary_name: &PathBuf, with_metadata: &bool, include_unk: &bool, node_feature_type: Option, @@ -250,8 +251,8 @@ impl AGCJFunctionCallGraphs { pub fn one_hop_to_petgraph( &self, global_cg: &AGCJFile, - output_path: &String, - binary_name: &str, + output_path: &PathBuf, + binary_name: &PathBuf, with_metadata: &bool, include_unk: &bool, node_feature_type: Option, @@ -273,8 +274,8 @@ impl AGCJFunctionCallGraphs { pub fn to_petgraph_with_callers( &self, global_cg: &AGCJFile, - output_path: &String, - binary_name: &str, + output_path: &PathBuf, + binary_name: &PathBuf, with_metadata: &bool, include_unk: &bool, node_feature_type: Option, @@ -296,8 +297,8 @@ impl AGCJFunctionCallGraphs { pub fn one_hop_to_petgraph_with_callers( &self, global_cg: &AGCJFile, - output_path: &String, - binary_name: &str, + output_path: &PathBuf, + binary_name: &PathBuf, with_metadata: &bool, include_unk: &bool, node_feature_type: Option, @@ -327,8 +328,8 @@ impl AGCJFunctionCallGraphs { &self, graph: Graph, global_cg: &AGCJFile, - binary_name: &str, - output_path: &String, + binary_name: &PathBuf, + output_path: &PathBuf, with_metadata: &bool, node_feature_type: Option, type_suffix: &str, @@ -384,13 +385,13 @@ impl AGCJFunctionCallGraphs { 
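// A note on the save helpers above: `format!("{:?}/{}-{}.json", full_output_path, ...)`
// uses Debug formatting, which quotes the `PathBuf`, so files land at paths like
// `"out/dir"/main-cg.json`. A quote-free sketch over the same variables:
//
//   let filename = format!(
//       "{}/{}-{}.json",
//       full_output_path.display(),
//       function_name,
//       type_suffix
//   );
//
// or, staying in PathBuf space throughout:
//
//   let filename = full_output_path.join(format!("{}-{}.json", function_name, type_suffix));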
#[cfg(test)] mod tests { use crate::files::AGCJFile; - use env_logger; + use std::path::PathBuf; fn return_test_file_oject() -> AGCJFile { let mut call_graph_file = AGCJFile { - filename: "test-files/ls_cg.json".to_string(), + filename: PathBuf::from("test-files/ls_cg.json"), function_call_graphs: None, - output_path: "".to_string(), + output_path: PathBuf::new(), function_metadata: None, include_unk: false, }; @@ -402,7 +403,7 @@ mod tests { } #[test] fn test_function_call_graph_without_unks() { - let mut call_graph_file = return_test_file_oject(); + let call_graph_file = return_test_file_oject(); // Get main function - No Unks let raw_call_graph_data = &call_graph_file.function_call_graphs.clone().unwrap()[0]; @@ -417,7 +418,7 @@ mod tests { #[test] fn test_function_call_graph_with_callees_without_unks() { - let mut call_graph_file = return_test_file_oject(); + let call_graph_file = return_test_file_oject(); // Unk False let raw_call_graph_data = &call_graph_file.function_call_graphs.clone().unwrap()[0]; @@ -452,7 +453,7 @@ mod tests { #[test] fn test_function_call_graph_with_callees_with_unks() { - let mut call_graph_file = return_test_file_oject(); + let call_graph_file = return_test_file_oject(); // sym.func.100004d11 - One unknown let raw_call_graph_data = &call_graph_file.function_call_graphs.clone().unwrap()[2]; @@ -471,7 +472,7 @@ mod tests { #[test] fn test_function_call_graph_callees_and_callers_with_unks() { - let mut call_graph_file = return_test_file_oject(); + let call_graph_file = return_test_file_oject(); // sym.func.100004d11 - One unknown let raw_call_graph_data = &call_graph_file.function_call_graphs.clone().unwrap()[2]; diff --git a/src/agfj.rs b/src/agfj.rs index 5c8de4d..7f10e2e 100644 --- a/src/agfj.rs +++ b/src/agfj.rs @@ -9,8 +9,10 @@ use petgraph::prelude::Graph; use petgraph::visit::Dfs; use serde::{Deserialize, Serialize}; use serde_json; +#[cfg(feature = "inference")] +use serde_json::{Map, Value}; use std::fs::File; -use std::path::Path; +use std::path::{Path, PathBuf}; #[cfg(feature = "inference")] use std::process::exit; #[cfg(feature = "inference")] @@ -239,9 +241,9 @@ impl AGFJFunc { #[cfg(feature = "inference")] pub fn generate_embedded_cfg( &self, - path: &str, + path: &PathBuf, min_blocks: &u16, - output_path: &String, + output_path: &PathBuf, feature_type: FeatureType, inference_job: &Option>, ) { @@ -303,11 +305,11 @@ impl AGFJFunc { ) }; - let file_name = path.split('/').last().unwrap(); + let file_name = path.file_name().unwrap(); let binary_name: Vec<_> = file_name.split(".j").collect(); let fname_string = format!( - "{}/{}-{}.json", + "{:?}/{:?}-{}.json", &full_output_path, binary_name[0], self.name ); serde_json::to_writer( @@ -323,16 +325,18 @@ impl AGFJFunc { pub fn generate_attributed_cfg( &self, - path: &str, + path: &PathBuf, min_blocks: &u16, - output_path: &String, + output_path: &PathBuf, feature_type: FeatureType, architecture: &String, ) { let full_output_path = get_save_file_path(path, output_path, None); check_or_create_dir(&full_output_path); - let file_name = path.split('/').last().unwrap(); - let binary_name: Vec<_> = file_name.split(".j").collect(); + let file_name = path.file_name().unwrap(); + let binding = file_name.to_string_lossy().to_string(); + + let binary_name: Vec<_> = binding.split(".j").collect(); let function_name = if self.name.chars().count() > 100 { &self.name[..75] @@ -341,7 +345,7 @@ impl AGFJFunc { }; let fname_string = format!( - "{}/{}-{}.json", + "{:?}/{:?}-{}.json", &full_output_path, binary_name[0], 
function_name ); @@ -541,6 +545,7 @@ impl From<(&String, Vec)> for TikNibFunc { #[cfg(test)] mod tests { use crate::bb::FeatureType; + use std::path::PathBuf; use crate::AGFJFile; @@ -551,11 +556,11 @@ mod tests { #[test] fn file_struct_creation() { - let file_path = "../sample-tool-outputs/r2/example_agfj@@F_output.json".to_string(); + let file_path = PathBuf::from("../sample-tool-outputs/r2/example_agfj@@F_output.json"); let file = AGFJFile { functions: None, filename: file_path.to_owned(), - output_path: "output.json".to_string(), + output_path: PathBuf::from("output.json"), min_blocks: 5, feature_type: Some(crate::bb::FeatureType::Gemini), architecture: None, @@ -565,20 +570,20 @@ mod tests { assert!(file.functions.is_none()); assert_eq!( file.filename, - "../sample-tool-outputs/r2/example_agfj@@F_output.json".to_string() + PathBuf::from("../sample-tool-outputs/r2/example_agfj@@F_output.json") ); - assert_eq!(file.output_path, "output.json".to_string()); + assert_eq!(file.output_path, PathBuf::from("output.json")); assert_eq!(file.min_blocks, 5); assert_eq!(file.feature_type, Some(FeatureType::Gemini)); } #[test] fn test_file_load_and_desearlize() { - let file_path = "test-files/r2-output-samples/example_agfj@@F_output.json".to_string(); + let file_path = PathBuf::from("test-files/r2-output-samples/example_agfj@@F_output.json"); let mut file = AGFJFile { functions: None, filename: file_path.to_owned(), - output_path: "output.json".to_string(), + output_path: PathBuf::from("output.json"), min_blocks: 5, feature_type: Some(crate::bb::FeatureType::Gemini), architecture: None, @@ -653,11 +658,11 @@ mod tests { #[test] fn test_func_edge_list_generation() { - let file_path = "test-files/r2-output-samples/test_bin_agfj.json".to_string(); + let file_path = PathBuf::from("test-files/r2-output-samples/test_bin_agfj.json"); let mut file = AGFJFile { functions: None, filename: file_path.to_owned(), - output_path: "output.json".to_string(), + output_path: PathBuf::from("output.json"), min_blocks: 5, feature_type: Some(crate::bb::FeatureType::Gemini), architecture: None, diff --git a/src/binnfo.rs b/src/binnfo.rs index ce953c6..e020581 100644 --- a/src/binnfo.rs +++ b/src/binnfo.rs @@ -1,10 +1,9 @@ use goblin::{error, Object}; use std::fs; -use std::path::Path; +use std::path::{Path, PathBuf}; -pub fn goblin_info(fpath: &str) -> error::Result<()> { - let path = Path::new(fpath); - let buffer = fs::read(path)?; +pub fn goblin_info(fpath: &PathBuf) -> error::Result<()> { + let buffer = fs::read(fpath)?; match Object::parse(&buffer)? 
{ Object::Elf(elf) => { println!("elf: {:#?}", &elf); diff --git a/src/dedup.rs b/src/dedup.rs index 80d1b62..b06eb48 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -11,7 +11,7 @@ use std::collections::hash_map::DefaultHasher; use std::collections::{HashMap, HashSet}; use std::fs::{read_to_string, File}; use std::hash::{Hash, Hasher}; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::string::String; use std::{fs, vec}; @@ -89,12 +89,12 @@ pub struct EsilFuncStringCorpus { pub binary_name_index: Vec, pub uniq_binaries: Vec, pub arch_index: Vec, - pub output_path: String, + pub output_path: PathBuf, } /// A collection of processed Esil Function String files impl EsilFuncStringCorpus { - pub fn new(directory: &String, output_path: &String) -> Result { + pub fn new(directory: &PathBuf, output_path: &PathBuf) -> Result { let mut filepaths = Vec::new(); let mut binary_name_index = Vec::new(); let mut uniq_binaries = Vec::new(); @@ -123,10 +123,9 @@ impl EsilFuncStringCorpus { } } - let output_path: String = if !output_path.ends_with('/') { - format!("{}{}", output_path, "/") - } else { - output_path.to_string() + let mut output_path = output_path.to_owned(); + if !output_path.to_string_lossy().to_string().ends_with("/") { + output_path.push("/"); }; Ok(EsilFuncStringCorpus { @@ -135,7 +134,7 @@ impl EsilFuncStringCorpus { binary_name_index, uniq_binaries, arch_index, - output_path, + output_path: output_path.to_owned(), }) } @@ -278,7 +277,7 @@ impl EsilFuncStringCorpus { if !just_stats { let uniques_to_drop = json!(unique_func_hash_tuples); - let fname_string = format!("{}{}-dedup.json", self.output_path, &target_binary_name); + let fname_string = format!("{:?}{}-dedup.json", self.output_path, &target_binary_name); serde_json::to_writer( &File::create(fname_string).expect("Failed to create writer"), &uniques_to_drop, @@ -291,25 +290,25 @@ impl EsilFuncStringCorpus { /// Struct and Impl for de-duplicating Call Graph Corpus's #[derive(Debug)] pub struct CGCorpus { - pub filepaths: Vec, - pub output_path: String, + pub filepaths: Vec, + pub output_path: PathBuf, pub filepath_format: String, pub node_type: CallGraphNodeFeatureType, } impl CGCorpus { pub fn new( - directory: &String, - output_path: &String, + directory: &PathBuf, + output_path: &PathBuf, filepath_format: &String, node_type: CallGraphNodeFeatureType, ) -> Result { - if !Path::new(output_path).exists() { + if output_path.exists() { fs::create_dir(output_path).expect("Failed to create output directory!"); - info!("Output path not found - Creating {}", output_path) + info!("Output path not found - Creating {:?}", output_path) } - let mut filepaths = Vec::new(); + let mut filepaths: Vec = Vec::new(); // Load all JSON filepaths for file in WalkDir::new(directory) @@ -317,21 +316,16 @@ impl CGCorpus { .filter_map(|file| file.ok()) { if file.path().to_string_lossy().ends_with(".json") { - filepaths.push(file.clone().path().to_string_lossy().to_string()); + filepaths.push(PathBuf::from(file.clone().path())); } } info!("Returning One Hop CG Corpus Struct"); - - let output_path = if output_path.ends_with('/') { - output_path.to_owned() - } else { - output_path.to_owned() + &*"/".to_string() - }; + let output_path = output_path.to_owned(); Ok(CGCorpus { filepaths, - output_path: output_path.to_string(), + output_path: output_path, filepath_format: filepath_format.to_string(), node_type, }) @@ -344,7 +338,7 @@ impl CGCorpus { } //fn dedup_corpus(data: &mut Vec>, filepaths: &mut Vec) { - fn dedup_corpus(data: &mut Vec>, filepaths: 
&mut Vec) { + fn dedup_corpus(data: &mut Vec>, filepaths: &mut Vec) { debug!("Creating the removal index"); let mut seen = HashSet::new(); @@ -365,29 +359,31 @@ impl CGCorpus { } } - fn get_binary_name_cisco(filepath: &String) -> String { + fn get_binary_name_cisco(filepath: &PathBuf) -> PathBuf { // Example: x86-gcc-9-O3_nping_cg-onehopcgcallers-meta let binary_intermediate = Path::new(filepath).parent().unwrap().file_name().unwrap(); - binary_intermediate - .to_string_lossy() - .split('_') - .nth(1) - .unwrap() - .to_string() - } - fn get_binary_name_binkit(filepath: &String) -> String { + PathBuf::from( + binary_intermediate + .to_string_lossy() + .split('_') + .nth(1) + .unwrap(), + ) + } + fn get_binary_name_binkit(filepath: &PathBuf) -> PathBuf { // Example: tar-1.34_gcc-8.2.0_x86_32_O3_rmt_cg-onehopcgcallers-meta let binary_intermediate = Path::new(filepath).parent().unwrap().file_name().unwrap(); - binary_intermediate - .to_string_lossy() - .split('_') - .rev() - .nth(1) - .unwrap() - .to_string() + PathBuf::from( + binary_intermediate + .to_string_lossy() + .split('_') + .rev() + .nth(1) + .unwrap(), + ) } - fn extract_binary_from_fps(&self) -> Vec { + fn extract_binary_from_fps(&self) -> Vec { let mut fp_binaries = Vec::new(); // Process the file paths to get the associated binary of each path info!("Processing Filepaths to get binaries"); @@ -404,10 +400,10 @@ impl CGCorpus { fp_binaries } - fn get_unique_binary_fps(&self, fp_binaries: Vec) -> Vec> { + fn get_unique_binary_fps(&self, fp_binaries: Vec) -> Vec> { // Generate binary specific filepath vectors - let unique_binaries: Vec<_> = fp_binaries.iter().unique().collect(); - let mut unique_binaries_fps: Vec> = vec![Vec::new(); unique_binaries.len()]; + let unique_binaries: Vec<&PathBuf> = fp_binaries.iter().unique().collect(); + let mut unique_binaries_fps: Vec> = vec![Vec::new(); unique_binaries.len()]; for (file, binary) in self.filepaths.iter().zip(fp_binaries.iter()) { unique_binaries_fps[unique_binaries.iter().position(|&x| x == binary).unwrap()] @@ -417,13 +413,13 @@ impl CGCorpus { unique_binaries_fps } - fn load_subset(&self, fp_subset: &[String]) -> Vec> { + fn load_subset(&self, fp_subset: &Vec) -> Vec> { let mut subset_loaded_data = Vec::new(); for ele in fp_subset.iter() { let data = read_to_string(ele).expect(&format!("Unable to read file - {:?}", ele)); let json = serde_json::from_str::(&data) - .expect(&format!("Unable to load function data from {}", ele)); + .expect(&format!("Unable to load function data from {:?}", ele)); let nodes_empty = match self.node_type { CallGraphNodeFeatureType::CGName => json.as_cg_name().unwrap().nodes.is_empty(), @@ -437,11 +433,10 @@ impl CGCorpus { subset_loaded_data.push(None) } } - println!("{:?}", subset_loaded_data); subset_loaded_data } - pub fn process_corpus(self) { + pub fn process_corpus(&self) { let fp_binaries = self.extract_binary_from_fps(); // Generate binary specific filepath vectors @@ -464,30 +459,32 @@ impl CGCorpus { debug!("File processing complete - {}", idx); }); } - pub fn save_corpus(&self, subset_loaded_data: Vec, fp_subset: &mut [String]) { + + fn generate_dedup_filepath(output_path: &PathBuf, filepath: &PathBuf) -> PathBuf { + let first_two = filepath.components().rev().take(2).collect::>(); + let first_two: PathBuf = first_two.iter().rev().collect(); + let output = output_path.clone(); + let mut final_path = PathBuf::new(); + final_path.push(output); + final_path.push(first_two); + + final_path + } + pub fn save_corpus( + &self, + subset_loaded_data: 
Vec, + fp_subset: &mut Vec, + ) { subset_loaded_data .iter() .zip(fp_subset.iter()) .for_each(|(data_ele, filepath)| { - let fixed_path: Vec<_> = Path::new(filepath) - .components() - .rev() - .take(2) - .collect::>(); - trace!("Fixed Path (First Pass): {:?}", fixed_path); - let fixed_path = fixed_path - .iter() - .map(|c| c.as_os_str().to_string_lossy().to_string()) - .rev() - .collect::>(); - trace!("Fixed Path (Second Pass): {:?}", fixed_path); - let dirs = format!("{}{}", self.output_path, fixed_path[0]); + let save_path = Self::generate_dedup_filepath(&self.output_path, filepath); + let dirs = save_path.parent().unwrap_or(Path::new("")); fs::create_dir_all(&dirs).expect("Failed to create output directory!"); - let fixed_path = format!("{}/{}", dirs, fixed_path[1]); - trace!("Fixed Path (Final Pass): {:?}", fixed_path); serde_json::to_writer( - &File::create(fixed_path).expect("Failed to create writer"), + &File::create(save_path).expect("Failed to create writer"), &data_ele, ) .expect("Unable to write JSON"); @@ -502,7 +499,7 @@ mod tests { }; use std::fs; use std::fs::read_to_string; - use std::path::Path; + use std::path::{Path, PathBuf}; use walkdir::WalkDir; // Test Dedup on typed CG's @@ -510,15 +507,15 @@ mod tests { fn test_cg_corpus_gen() { // CG Corpus Generation let corpus = CGCorpus::new( - &"test-files/cg_dedup/to_dedup".to_string(), - &"test-files/cg_dedup/deduped".to_string(), + &PathBuf::from("test-files/cg_dedup/to_dedup"), + &mut PathBuf::from("test-files/cg_dedup/deduped"), &"cisco".to_string(), CallGraphNodeFeatureType::CGName, ); assert_eq!(corpus.as_ref().unwrap().filepaths.len(), 12); assert_eq!( corpus.as_ref().unwrap().output_path, - "test-files/cg_dedup/deduped/".to_string() + PathBuf::from("test-files/cg_dedup/deduped/") ); assert_eq!( corpus.as_ref().unwrap().filepath_format, @@ -526,15 +523,15 @@ mod tests { ); let corpus = CGCorpus::new( - &"test-files/cg_dedup/to_dedup".to_string(), - &"test-files/cg_dedup/deduped/".to_string(), + &PathBuf::from("test-files/cg_dedup/to_dedup"), + &PathBuf::from("test-files/cg_dedup/deduped/"), &"cisco".to_string(), CallGraphNodeFeatureType::CGName, ); assert_eq!(corpus.as_ref().unwrap().filepaths.len(), 12); assert_eq!( corpus.as_ref().unwrap().output_path, - "test-files/cg_dedup/deduped/".to_string() + PathBuf::from("test-files/cg_dedup/deduped/") ); assert_eq!( corpus.as_ref().unwrap().filepath_format, @@ -545,8 +542,8 @@ mod tests { #[test] fn test_extract_binary_from_fps() { let corpus = CGCorpus::new( - &"test-files/cg_dedup/to_dedup".to_string(), - &"test-files/cg_dedup/deduped".to_string(), + &PathBuf::from("test-files/cg_dedup/to_dedup"), + &PathBuf::from("test-files/cg_dedup/deduped"), &"cisco".to_string(), CallGraphNodeFeatureType::CGMeta, ); @@ -556,18 +553,18 @@ mod tests { assert_eq!( fp_binaries, vec![ - "testbin".to_string(), - "testbin".to_string(), - "testbin".to_string(), - "testbin".to_string(), - "testbin".to_string(), - "testbin".to_string(), - "testbin".to_string(), - "testbin".to_string(), - "testbin2".to_string(), - "testbin2".to_string(), - "testbin2".to_string(), - "testbin2".to_string(), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin2"), + PathBuf::from("testbin2"), + PathBuf::from("testbin2"), + PathBuf::from("testbin2"), ] ) } @@ -575,8 +572,8 @@ mod tests { #[test] fn 
test_get_unique_binary_fps() { let corpus = CGCorpus::new( - &"test-files/cg_dedup/to_dedup".to_string(), - &"test-files/cg_dedup/deduped".to_string(), + &PathBuf::from("test-files/cg_dedup/to_dedup"), + &mut PathBuf::from("test-files/cg_dedup/deduped"), &"cisco".to_string(), CallGraphNodeFeatureType::CGMeta, ) @@ -592,8 +589,8 @@ mod tests { #[test] fn test_processing_unique_binary_collection() { let corpus = CGCorpus::new( - &"test-files/cg_dedup/to_dedup".to_string(), - &"test-files/cg_dedup/deduped".to_string(), + &PathBuf::from("test-files/cg_dedup/to_dedup"), + &mut PathBuf::from("test-files/cg_dedup/deduped"), &"cisco".to_string(), CallGraphNodeFeatureType::CGMeta, ) @@ -612,8 +609,8 @@ mod tests { #[test] fn test_dedup_binary_subset() { let corpus = CGCorpus::new( - &"test-files/cg_dedup/to_dedup".to_string(), - &"test-files/cg_dedup/deduped".to_string(), + &PathBuf::from("test-files/cg_dedup/to_dedup"), + &mut PathBuf::from("test-files/cg_dedup/deduped"), &"cisco".to_string(), CallGraphNodeFeatureType::CGMeta, ) @@ -689,7 +686,9 @@ mod tests { } // clean up - fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!"); + if corpus.output_path.is_dir() { + fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!"); + } } // Test binary name extraction @@ -697,24 +696,24 @@ mod tests { fn test_binkit_binary_extraction() { assert_eq!( crate::dedup::CGCorpus::get_binary_name_binkit( - &"which-2.21_gcc-9.4.0_arm_32_O2_which_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json -".to_string() + &PathBuf::from("which-2.21_gcc-9.4.0_arm_32_O2_which_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json +") ), - "which" + PathBuf::from("which") ); assert_eq!( crate::dedup::CGCorpus::get_binary_name_binkit( - &"recutils-1.9_gcc-11.2.0_mips_64_O3_recins_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json -".to_string() + &PathBuf::from("recutils-1.9_gcc-11.2.0_mips_64_O3_recins_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json +") ), - "recins" + PathBuf::from("recins") ); assert_eq!( crate::dedup::CGCorpus::get_binary_name_binkit( - &"recutils-1.9_gcc-11.2.0_mips_64_O3_recsel_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json -".to_string(), + &PathBuf::from("recutils-1.9_gcc-11.2.0_mips_64_O3_recsel_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json +"), ), - "recsel", + PathBuf::from("recsel"), ); } @@ -722,30 +721,30 @@ mod tests { fn test_cisco_binary_extraction() { assert_eq!( crate::dedup::CGCorpus::get_binary_name_binkit( - &"arm64-clang-9-Os_curl_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json".to_string() + &PathBuf::from("arm64-clang-9-Os_curl_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json") ), - "curl" + PathBuf::from("curl") ); assert_eq!( crate::dedup::CGCorpus::get_binary_name_binkit( - &"x86-clang-9-Os_libcrypto.so.3_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json -".to_string() + &PathBuf::from("x86-clang-9-Os_libcrypto.so.3_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json +") ), - "libcrypto.so.3" + PathBuf::from("libcrypto.so.3") ); assert_eq!( crate::dedup::CGCorpus::get_binary_name_binkit( - &"x86-gcc-9-O3_unrar_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json -".to_string(), + &PathBuf::from("x86-gcc-9-O3_unrar_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json +"), ), - "unrar", + PathBuf::from("unrar"), ); assert_eq!( 
crate::dedup::CGCorpus::get_binary_name_binkit( - &"/random/path/before/x86-gcc-9-O3_unrar_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json -".to_string(), + &PathBuf::from("/random/path/before/x86-gcc-9-O3_unrar_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json +"), ), - "unrar", + PathBuf::from("unrar"), ); } @@ -753,37 +752,37 @@ mod tests { fn test_trex_binary_extraction() { assert_eq!( crate::dedup::CGCorpus::get_binary_name_binkit( - &"arm-32_binutils-2.34-O0_elfedit_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json".to_string() + &PathBuf::from("arm-32_binutils-2.34-O0_elfedit_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json") ), - "elfedit" + PathBuf::from("elfedit") ); assert_eq!( crate::dedup::CGCorpus::get_binary_name_binkit( - &"arm-32_binutils-2.34-O0_objdump_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json".to_string() + &PathBuf::from("arm-32_binutils-2.34-O0_objdump_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json") ), - "objdump" + PathBuf::from("objdump") ); assert_eq!( crate::dedup::CGCorpus::get_binary_name_binkit( - &"arm-32_binutils-2.34-O0_nm-new_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json".to_string() + &PathBuf::from("arm-32_binutils-2.34-O0_nm-new_cg-onehopcgcallers-meta/sym.dummy-func-onehopcgcallers-meta.json") ), - "nm-new" + PathBuf::from("nm-new") ); // __ for c++ bins that sometimes crop up assert_eq!( crate::dedup::CGCorpus::get_binary_name_binkit( - &"arm-32_binutils-2.34-O0_nm-new_cg-onehopcgcallers-meta/sym.dummy___func__-onehopcgcallers-meta.json".to_string() + &PathBuf::from("arm-32_binutils-2.34-O0_nm-new_cg-onehopcgcallers-meta/sym.dummy___func__-onehopcgcallers-meta.json") ), - "nm-new" + PathBuf::from("nm-new") ); assert_eq!( - crate::dedup::CGCorpus::get_binary_name_binkit(&"fast-disk/Dataset-2/cgs/x86-32_coreutils-8.32-O1_stat_cg-onehopcgcallers-meta/main-onehopcgcallers-meta.json".to_string()), - "stat" + crate::dedup::CGCorpus::get_binary_name_binkit(&PathBuf::from("fast-disk/Dataset-2/cgs/x86-32_coreutils-8.32-O1_stat_cg-onehopcgcallers-meta/main-onehopcgcallers-meta.json")), + PathBuf::from("stat") ); - assert_eq!(crate::dedup::CGCorpus::get_binary_name_binkit(&"/fast-disk/processed_datasets/Dataset-2/arm-32_binutils-2.34-O0_addr2line_cg-onehopcgcallers-meta/sym.adjust_relative_path-onehopcgcallers-meta.json".to_string()), - "addr2line") + assert_eq!(crate::dedup::CGCorpus::get_binary_name_binkit(&PathBuf::from("/fast-disk/processed_datasets/Dataset-2/arm-32_binutils-2.34-O0_addr2line_cg-onehopcgcallers-meta/sym.adjust_relative_path-onehopcgcallers-meta.json")), + PathBuf::from("addr2line")) } } diff --git a/src/extract.rs b/src/extract.rs index 8d01bd3..f52253b 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -12,7 +12,7 @@ use serde_json::{json, Value}; use std::collections::HashMap; use std::fs; use std::fs::File; -use std::path::Path; +use std::path::{Path, PathBuf}; use walkdir::WalkDir; #[derive(PartialEq, Debug)] @@ -35,25 +35,25 @@ pub enum ExtractionJobType { #[derive(Debug)] pub struct FileToBeProcessed { - pub file_path: String, - pub output_path: String, + pub file_path: PathBuf, + pub output_path: PathBuf, pub job_type_suffix: String, } #[derive(Debug)] pub struct ExtractionJob { - pub input_path: String, + pub input_path: PathBuf, pub input_path_type: PathType, pub job_type: ExtractionJobType, pub files_to_be_processed: Vec, - pub output_path: String, // Remove - Kept for backwards compat + pub output_path: 
PathBuf, // Remove - Kept for backwards compat } impl std::fmt::Display for ExtractionJob { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( f, - "bin_path: {} p_type: {:?} what_do: {:?}", + "bin_path: {:?} p_type: {:?} what_do: {:?}", self.input_path, self.input_path_type, self.job_type ) } @@ -203,16 +203,20 @@ impl std::fmt::Display for AFLJFuncDetails { impl From<(String, String, String)> for FileToBeProcessed { fn from(orig: (String, String, String)) -> FileToBeProcessed { FileToBeProcessed { - file_path: orig.0, - output_path: orig.1, + file_path: PathBuf::from(orig.0), + output_path: PathBuf::from(orig.1), job_type_suffix: orig.2, } } } impl ExtractionJob { - pub fn new(input_path: &str, output_path: &str, mode: &str) -> Result { - fn get_path_type(bin_path: &str) -> PathType { + pub fn new( + input_path: &PathBuf, + output_path: &PathBuf, + mode: &str, + ) -> Result { + fn get_path_type(bin_path: &PathBuf) -> PathType { let fpath_md = fs::metadata(bin_path).unwrap(); let p_type: PathType; if fpath_md.is_file() { @@ -244,40 +248,46 @@ impl ExtractionJob { if p_type == PathType::File { let file = FileToBeProcessed { - file_path: input_path.to_string(), - output_path: output_path.to_string(), + file_path: input_path.to_owned(), + output_path: output_path.to_owned(), job_type_suffix: (*mode).to_string(), }; Ok(ExtractionJob { - input_path: input_path.to_string(), + input_path: input_path.to_owned(), input_path_type: p_type, job_type, files_to_be_processed: vec![file], - output_path: (*output_path).to_string(), + output_path: output_path.to_owned(), }) } else if p_type == PathType::Dir { - let files = ExtractionJob::get_file_paths_dir(input_path.to_string()); + let files = ExtractionJob::get_file_paths_dir(input_path); let files_with_output_path: Vec<(String, String, String)> = files .into_iter() - .map(|f| (f, output_path.to_string(), mode.to_string())) + .map(|f| { + ( + f, + output_path.to_string_lossy().to_string(), + mode.to_string(), + ) + }) .collect(); let files_to_be_processed: Vec = files_with_output_path .into_iter() .map(FileToBeProcessed::from) .collect(); Ok(ExtractionJob { - input_path: input_path.to_string(), + input_path: input_path.to_owned(), input_path_type: p_type, job_type, files_to_be_processed, - output_path: output_path.to_string(), + output_path: output_path.to_owned(), }) } else { bail!("Failed to create extraction job.") } } - fn get_file_paths_dir(input_path: String) -> Vec { + fn get_file_paths_dir(input_path: &PathBuf) -> Vec { let mut str_vec: Vec = Vec::new(); for file in WalkDir::new(input_path) .into_iter() @@ -326,19 +336,19 @@ impl FileToBeProcessed { .to_string_lossy() .to_string(); fp_filename = fp_filename + "_" + &self.job_type_suffix.clone(); - let f_name = format!("{}/{}.json", &self.output_path, fp_filename); + let f_name = format!("{:?}/{}.json", &self.output_path, fp_filename); if !Path::new(&f_name).exists() { info!("{} not found. 
Continuing processing.", f_name); // This creates HUGE JSON files for each files // Approximately 40x file size to JSON let mut r2p = self.setup_r2_pipe(&self.file_path, debug); - info!("Executing agfj @@f on {}", self.file_path); + info!("Executing agfj @@f on {:?}", self.file_path); let mut json = r2p.cmd("agfj @@f").expect("Command failed.."); - info!("Closing r2p process for {}", self.file_path); + info!("Closing r2p process for {:?}", self.file_path); r2p.close(); - info!("Starting JSON fixup for {}", self.file_path); + info!("Starting JSON fixup for {:?}", self.file_path); // Fix JSON object json = json.replace("[]\n", ","); json = json.replace("}]\n[{", "}],\n[{"); @@ -347,7 +357,7 @@ impl FileToBeProcessed { json = json.replace("}]\n,]", "}]\n]"); json = json.replace("\n,,[{", "\n,[{"); json = json.replace("\n,,[{", "\n,[{"); - info!("JSON fixup finished for {}", self.file_path); + info!("JSON fixup finished for {:?}", self.file_path); if json != "[,]" { #[allow(clippy::expect_fun_call)] @@ -402,14 +412,15 @@ impl FileToBeProcessed { pub fn extract_function_info(&self, debug: &bool) { info!("Starting function metdata extraction"); - let mut fp_filename = Path::new(self.file_path.as_str()) + let mut fp_filename = self + .file_path .file_name() .expect("Unable to get filename") .to_string_lossy() .to_string(); fp_filename = fp_filename + "_" + &self.job_type_suffix.clone(); - let f_name = format!("{}/{}.json", self.output_path, fp_filename); + let f_name = format!("{:?}/{}.json", self.output_path, fp_filename); if !Path::new(&f_name).exists() { let mut r2p = self.setup_r2_pipe(&self.file_path, debug); let function_details = self.get_function_name_list(&mut r2p); @@ -439,7 +450,7 @@ impl FileToBeProcessed { info!("Getting function information from binary"); let json = r2p .cmd("aflj") - .expect(&format!("aflj command failed for {}", self.file_path)); + .expect(&format!("aflj command failed for {:?}", self.file_path)); let json_obj: Vec = serde_json::from_str(&json) .expect(&format!("Unable to convert to JSON object! - {}", json)); @@ -484,14 +495,15 @@ impl FileToBeProcessed { // Helper Functions fn write_to_json(&self, json_obj: &Value) { - let mut fp_filename = Path::new(self.file_path.as_str()) + let mut fp_filename = self + .file_path .file_name() .expect("Unable to get filename") .to_string_lossy() .to_string(); fp_filename = fp_filename + "_" + &self.job_type_suffix.clone(); - let f_name = format!("{}/{}.json", self.output_path, fp_filename); + let f_name = format!("{:?}/{}.json", self.output_path, fp_filename); serde_json::to_writer( &File::create(&f_name).expect("Unable to create file!"), &json_obj, @@ -504,7 +516,7 @@ impl FileToBeProcessed { .expect("failed to seek addr"); } - fn setup_r2_pipe(&self, s: &String, debug: &bool) -> R2Pipe { + fn setup_r2_pipe(&self, s: &PathBuf, debug: &bool) -> R2Pipe { // Setup R2 pipe with options and return it // Could be extended to include toggling of options // + more args? 
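    // In brief, the setup below reuses an existing r2 session when bin2ml is
    // itself launched from inside radare2, and spawns a fresh pipe against the
    // target path otherwise (sketch; the `opts` construction is in the hunk
    // that follows):
    //
    //   let mut r2p = match R2Pipe::in_session() {
    //       Some(_) => R2Pipe::open().expect("Unable to open R2Pipe"),
    //       None => R2Pipe::spawn(s.to_str().unwrap(), Some(opts))
    //           .expect("Failed to spawn new R2Pipe"),
    //   };
    //   r2p.cmd("aa").expect("Unable to complete standard analysis!");
    //
    // The `in_session` check is what lets the same extraction code run both
    // standalone and from within an interactive r2 shell.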
@@ -521,16 +533,18 @@ impl FileToBeProcessed { args: vec!["-e bin.cache=true", "-e log.level=0"], } }; - debug!("Attempting to create r2pipe using {}", s); + debug!("Attempting to create r2pipe using {:?}", s); let mut r2p = match R2Pipe::in_session() { Some(_) => R2Pipe::open().expect("Unable to open R2Pipe"), - None => R2Pipe::spawn(s, Some(opts)).expect("Failed to spawn new R2Pipe"), + None => { + R2Pipe::spawn(s.to_str().unwrap(), Some(opts)).expect("Failed to spawn new R2Pipe") + } }; - debug!("Executing 'aa' r2 command for {}", s); + debug!("Executing 'aa' r2 command for {:?}", s); r2p.cmd("aa") .expect("Unable to complete standard analysis!"); - debug!("'aa' r2 command complete for {}", s); + debug!("'aa' r2 command complete for {:?}", s); r2p } } diff --git a/src/files.rs b/src/files.rs index 08a61b2..e8b8150 100644 --- a/src/files.rs +++ b/src/files.rs @@ -16,7 +16,7 @@ use serde_json::json; use std::collections::HashMap; use std::fs::{read_to_string, File}; use std::io::{BufWriter, Write}; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::string::String; use std::sync::mpsc::channel; #[cfg(feature = "inference")] @@ -24,9 +24,9 @@ use std::sync::Arc; #[derive(Serialize, Deserialize, Debug)] pub struct AGFJFile { - pub filename: String, + pub filename: PathBuf, pub functions: Option>>, - pub output_path: String, + pub output_path: PathBuf, pub min_blocks: u16, pub feature_type: Option, pub architecture: Option, @@ -54,7 +54,7 @@ impl AGFJFile { #[allow(clippy::expect_fun_call)] // Kept in to ensure that the JSON decode error message is printed alongside the filename let json: Vec> = serde_json::from_str(&data).expect(&format!( - "Unable to load function data from {}", + "Unable to load function data from {:?}", self.filename )); @@ -139,11 +139,11 @@ impl AGFJFile { /// It is *not* suitable for doing any other sort of tasks such as Next Sentence /// Prediction (NSP) as there is not indication of where a basic block starts or ends. pub fn generate_random_bb_walk(mut self, esil: bool, pairs: bool) { - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); let fname_string = if esil { - format!("{}-esil-singles-rwdfs.txt", fname_string) + format!("{:?}-esil-singles-rwdfs.txt", fname_string) } else { - format!("{}-dis-singles-rwdfs.txt", fname_string) + format!("{:?}-dis-singles-rwdfs.txt", fname_string) }; if !Path::new(&fname_string).exists() { @@ -189,8 +189,8 @@ impl AGFJFile { /// Generates a single string which contains the ESIL representation of every /// instruction within a function pub fn generate_esil_func_strings(mut self) { - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); - let fname_string = format!("{}-efs.json", fname_string); + let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string = format!("{:?}-efs.json", fname_string); if !Path::new(&fname_string).exists() { self.load_and_deserialize() @@ -229,8 +229,8 @@ impl AGFJFile { pub fn generate_disasm_func_strings(mut self) { // This needs to be amended so that there is a AGFJFunc function // that returns a function as a func string. 
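    // Each of the *_func_strings generators in this file funnels into the same
    // final step: collect (function name, string) pairs into a map and serialise
    // it to JSON. A minimal sketch of that step (`pairs` is illustrative; the
    // real code builds it from `self.functions`):
    //
    //   use serde_json::json;
    //   use std::collections::HashMap;
    //   use std::fs::File;
    //
    //   let map: HashMap<String, String> = pairs.into_iter().collect();
    //   serde_json::to_writer(
    //       &File::create(&fname_string).expect("Failed to create writer"),
    //       &json!(map),
    //   )
    //   .expect("Unable to write JSON");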
- let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); - let fname_string = format!("{}-dfs.json", fname_string); + let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string = format!("{:?}-dfs.json", fname_string); if !Path::new(&fname_string).exists() { self.load_and_deserialize() @@ -254,9 +254,9 @@ impl AGFJFile { let map: HashMap<_, _> = fixed.into_iter().collect(); let json = json!(map); - let fname_string: String = + let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); - let fname_string = format!("{}-dfs.json", fname_string); + let fname_string = format!("{:?}-dfs.json", fname_string); serde_json::to_writer( &File::create(fname_string).expect("Failed to create writer"), @@ -273,11 +273,11 @@ impl AGFJFile { /// This ignores control flow and simple iterates the JSON objects from the top to /// the bottom. pub fn generate_linear_bb_walk(mut self, esil: bool) { - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); let fname_string = if esil { - format!("{}-esil-singles.txt", fname_string) + format!("{:?}-esil-singles.txt", fname_string) } else { - format!("{}-dis-singles.txt", fname_string) + format!("{:?}-dis-singles.txt", fname_string) }; if !Path::new(&fname_string).exists() { @@ -335,8 +335,8 @@ impl AGFJFile { } let json = json!(&func_feature_vectors); - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); - let fname_string = format!("{}-tiknib.json", fname_string); + let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string = format!("{:?}-tiknib.json", fname_string); serde_json::to_writer( &File::create(fname_string).expect("Failed to create writer"), &json, @@ -376,9 +376,9 @@ pub enum FunctionMetadataTypes { #[derive(Serialize, Deserialize, Debug)] pub struct AGCJFile { - pub filename: String, + pub filename: PathBuf, pub function_call_graphs: Option>, - pub output_path: String, + pub output_path: PathBuf, pub function_metadata: Option, pub include_unk: bool, } @@ -398,9 +398,9 @@ impl AGCJFile { #[derive(Serialize, Deserialize, Debug)] pub struct AFIJFile { - pub filename: String, + pub filename: PathBuf, pub function_info: Option>, - pub output_path: String, + pub output_path: PathBuf, } impl AFIJFile { @@ -426,8 +426,8 @@ impl AFIJFile { } pub fn subset_and_save(&mut self) { let func_info_subsets = self.subset(); - let fname_string: String = get_save_file_path(&self.filename, &self.output_path, None); - let filename = format!("{}-finfo-subset.json", fname_string); + let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); + let filename = format!("{:?}-finfo-subset.json", fname_string); serde_json::to_writer( &File::create(filename).expect("Failed to create writer"), &func_info_subsets, @@ -438,9 +438,9 @@ impl AFIJFile { #[derive(Serialize, Deserialize, Debug)] pub struct TikNibFuncMetaFile { - pub filename: String, + pub filename: PathBuf, pub function_info: Option>, - pub output_path: String, + pub output_path: PathBuf, } impl TikNibFuncMetaFile { diff --git a/src/main.rs b/src/main.rs index 35a4cc8..be8a36c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -93,7 +93,7 @@ enum GenerateSubCommands { Graphs { /// The path to a JSON file extracted using the command #[arg(short, long, value_name = "FILENAME")] - path: 
String, + path: PathBuf, /// The target data type #[arg(short, long, value_name = "DATA_TYPE", value_parser = clap::builder::PossibleValuesParser::new(["cfg", "cg", "onehopcg", "cgcallers", "onehopcgcallers"]) @@ -102,7 +102,7 @@ enum GenerateSubCommands { /// The output path for the processed Networkx graphs (1 per function) #[arg(short, long, value_name = "OUTPUT")] - output_path: String, + output_path: PathBuf, /// The type of features to generate per basic block (node) #[arg(short, long, value_name = "FEATURE_TYPE", value_parser = clap::builder::PossibleValuesParser::new(["gemini", "discovre", "dgis"]) @@ -143,7 +143,7 @@ enum GenerateSubCommands { /// Filepath to the AFIJ function metadata (For call graphs) #[arg(long)] - metadata_path: Option, + metadata_path: Option, /// Include unknown functions (For call graphs) #[arg(long, default_value = "false")] @@ -158,7 +158,7 @@ enum GenerateSubCommands { Nlp { /// The path to a JSON file extracted using the command #[arg(short, long, value_name = "FILENAME")] - path: String, + path: PathBuf, /// The type of data to be generated #[arg(short, long, value_name = "DATA_TYPE", value_parser = clap::builder::PossibleValuesParser::new(["esil", "disasm"]) @@ -171,7 +171,7 @@ enum GenerateSubCommands { /// The output path for the processed data #[arg(short, long, value_name = "OUTPUT_PATH")] - data_out_path: String, + data_out_path: PathBuf, /// The format of the output data #[arg(short, long, value_name = "FORMAT", value_parser = clap::builder::PossibleValuesParser::new(["single", "funcstring"]) @@ -194,10 +194,10 @@ enum GenerateSubCommands { Metadata { /// The path to an afji JSON file extracted using the command #[arg(short, long, value_name = "INPUT_PATH")] - input_path: String, + input_path: PathBuf, /// The path for the generated output #[arg(short, long, value_name = "OUTPUT_PATH")] - output_path: String, + output_path: PathBuf, /// Data Source Type #[arg(short, long, value_parser = clap::builder::PossibleValuesParser::new(["finfo", "agfj"]) .map(|s| s.parse::().unwrap()))] @@ -231,7 +231,7 @@ enum Commands { Info { /// The path to the target binary #[arg(short, long, value_name = "FILENAME")] - path: Option, + path: Option, }, /// Generate processed data from extracted raw data Generate { @@ -242,11 +242,11 @@ enum Commands { Extract { /// The path to the dir or binary to be processed #[arg(short, long, value_name = "DIR")] - fpath: String, + fpath: PathBuf, /// The path for the output directory #[arg(short, long, value_name = "DIR")] - output_dir: String, + output_dir: PathBuf, /// The extraction mode #[arg(short, long, value_name = "EXTRACT_MODE", value_parser = clap::builder::PossibleValuesParser::new(["finfo", "reg", "cfg", "xrefs","cg"]) @@ -295,11 +295,11 @@ enum DedupSubCommands { Cgs { /// The filename to dedup #[arg(short, long, value_name = "FILENAME")] - filename: String, + filename: PathBuf, /// Output path to save dedup corpus #[arg(short, long, value_name = "OUTPUT_PATH")] - output_path: String, + output_path: PathBuf, /// Number of threads to use with Rayon #[arg(short, long, value_name = "NUM_THREADS", default_value = "2")] @@ -319,11 +319,11 @@ enum DedupSubCommands { Esil { /// The filename to dedup #[arg(short, long, value_name = "FILENAME")] - filename: String, + filename: PathBuf, /// Output path to save dedup corpus #[arg(short, long, value_name = "OUTPUT_PATH")] - output_path: String, + output_path: PathBuf, /// Toggle to print statistics of number of functions before and after dedup #[arg(long, default_value = 
"false")] @@ -397,8 +397,8 @@ fn main() { warn!("The 'with_features' toggle is set but is not support for CFG generation. Will ignore.") }; - if !Path::new(path).exists() { - error!("{} does not exist!", path); + if path.exists() { + error!("{:?} does not exist!", path); exit(1) } info!("Chosen Graph Type: {}", graph_data_type); @@ -441,7 +441,7 @@ fn main() { { if file.path().to_string_lossy().ends_with(".json") { agfj_graph_statistical_features( - file.path().to_str().unwrap(), + file.path(), &min_blocks.unwrap(), output_path, feature_vec_type, @@ -483,26 +483,26 @@ fn main() { exit(1) }; let mut metadata = AFIJFile { - filename: metadata_path.clone().unwrap(), + filename: metadata_path.clone().unwrap().to_path_buf(), function_info: None, - output_path: "".to_string(), + output_path: PathBuf::new(), }; metadata .load_and_deserialize() .expect("Unable to load file"); let metadata_subset = metadata.subset(); AGCJFile { - filename: path.to_owned(), + filename: (*path).clone(), function_call_graphs: None, - output_path: output_path.to_owned(), + output_path: (*output_path).clone(), function_metadata: Some(metadata_subset), include_unk: *include_unk, } } else { AGCJFile { - filename: path.to_owned(), + filename: (*path).clone(), function_call_graphs: None, - output_path: output_path.to_owned(), + output_path: (*output_path).clone(), function_metadata: None, include_unk: *include_unk, } @@ -573,20 +573,20 @@ fn main() { debug!("Creating call graphs without any node features"); file_paths_vec.par_iter().progress().for_each(|path| { let suffix = graph_type.to_owned().to_string(); - let full_output_path = PathBuf::from(get_save_file_path( - path, + let full_output_path = get_save_file_path( + &PathBuf::from(path), output_path, Some(suffix), - )); + ); if !full_output_path.is_dir() { let mut file = AGCJFile { - filename: path.to_owned(), + filename: path.to_owned().parse().unwrap(), function_call_graphs: None, output_path: output_path.to_owned(), function_metadata: None, include_unk: *include_unk, }; - debug!("Proceissing {}", file.filename); + debug!("Proceissing {:?}", file.filename); file.load_and_deserialize() .expect("Unable to load and desearilize JSON"); @@ -668,29 +668,28 @@ fn main() { .zip(metadata_paths_vec) .collect::>(); - combined_cgs_metadata.par_iter().progress().for_each(|tup| { + combined_cgs_metadata.par_iter().progress().for_each(|(filepath, metapath)| { let suffix = format!("{}-meta", graph_type.to_owned()); - let full_output_path = - PathBuf::from(get_save_file_path(&tup.0, output_path, Some(suffix))); + let full_output_path = get_save_file_path(&PathBuf::from(filepath), output_path, Some(suffix)); if !full_output_path.is_dir() { let mut file = { let metadata: Option; if metadata_type.clone().unwrap() == *"finfo" { let mut metadata_file = AFIJFile { - filename: tup.1.clone(), + filename: PathBuf::from(metapath), function_info: None, - output_path: "".to_string(), + output_path: PathBuf::new(), }; - debug!("Attempting to load metadata file: {}", tup.1); + debug!("Attempting to load metadata file: {}", metapath); metadata_file .load_and_deserialize() .expect("Unable to load associated metadata file"); metadata = Some(metadata_file.subset()); } else if metadata_type.clone().unwrap() == *"tiknib" { let mut metadata_file = TikNibFuncMetaFile { - filename: tup.1.clone(), + filename: PathBuf::from(metapath), function_info: None, - output_path: "".to_string(), + output_path: PathBuf::new(), }; metadata_file.load_and_deserialize().expect("Unable to load associated metadata file"); @@ 
-700,14 +699,14 @@ fn main() { } AGCJFile { - filename: tup.0.to_owned(), + filename: PathBuf::from(filepath), function_call_graphs: None, output_path: output_path.to_owned(), function_metadata: metadata, include_unk: *include_unk, } }; - debug!("Attempting to load {}", file.filename); + debug!("Attempting to load {:?}", file.filename); file.load_and_deserialize() .expect("Unable to load and desearilize JSON"); @@ -753,7 +752,7 @@ fn main() { ); } } - debug!("Finished generating cgs + metadata for {}", file.filename); + debug!("Finished generating cgs + metadata for {:?}", file.filename); } else { info!("Skipping {} as already exists", full_output_path.to_string_lossy()) }}); @@ -783,7 +782,7 @@ fn main() { let mut file = AGFJFile { functions: None, filename: input_path.to_owned(), - output_path: output_path.to_string(), + output_path: output_path.to_owned(), min_blocks: 1, // Dummy feature_type: None, architecture: None, @@ -831,7 +830,7 @@ fn main() { let file = AGFJFile { functions: None, filename: path.to_owned(), - output_path: data_out_path.to_string(), + output_path: data_out_path.to_owned(), min_blocks: *min_blocks, feature_type: None, architecture: None, @@ -849,8 +848,8 @@ fn main() { for file in file_paths_vec.iter().progress() { let file = AGFJFile { functions: None, - filename: file.to_string(), - output_path: data_out_path.to_string(), + filename: PathBuf::from(file), + output_path: data_out_path.to_owned(), min_blocks: *min_blocks, feature_type: None, architecture: None, @@ -964,7 +963,7 @@ fn main() { info!("Extraction Job type: Function Info"); job.files_to_be_processed[0].extract_function_info(debug) } - info!("Extraction complete for {}", fpath) + info!("Extraction complete for {:?}", fpath) } } @@ -1004,7 +1003,7 @@ fn main() { .unwrap(); corpus.process_corpus(); } else { - error!("Filename provided does not exist! - {}", filename) + error!("Filename provided does not exist! 
- {:?}", filename) } } DedupSubCommands::Esil { diff --git a/src/processors.rs b/src/processors.rs index 3c52366..6d10c09 100644 --- a/src/processors.rs +++ b/src/processors.rs @@ -6,6 +6,7 @@ use crate::files::AGFJFile; #[cfg(feature = "inference")] use crate::inference::InferenceJob; use serde::{Deserialize, Serialize}; +use std::path::{Path, PathBuf}; #[cfg(feature = "inference")] use std::process::exit; #[cfg(feature = "inference")] @@ -41,9 +42,9 @@ struct EdgePair { #[allow(clippy::too_many_arguments)] #[cfg(feature = "inference")] pub fn agfj_graph_embedded_feats( - path: &str, + path: &Path, min_blocks: &u16, - output_path: &str, + output_path: &Path, feature_type: FeatureType, tokeniser_fp: &Option, model_fp: &Option, @@ -53,7 +54,7 @@ pub fn agfj_graph_embedded_feats( let file = AGFJFile { functions: None, filename: path.to_owned(), - output_path: output_path.to_string(), + output_path: output_path.to_owned(), min_blocks: *min_blocks, feature_type: Some(feature_type), architecture: None, @@ -86,15 +87,15 @@ pub fn agfj_graph_embedded_feats( } pub fn agfj_graph_statistical_features( - path: &str, + path: &Path, min_blocks: &u16, - output_path: &str, + output_path: &PathBuf, feature_type: FeatureType, ) { let mut file = AGFJFile { functions: None, filename: path.to_owned(), - output_path: output_path.to_string(), + output_path: output_path.to_owned(), min_blocks: *min_blocks, feature_type: Some(feature_type), architecture: None, diff --git a/src/utils.rs b/src/utils.rs index 77fa795..a1cd0df 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,5 +1,5 @@ use std::fs::create_dir_all; -use std::path::Path; +use std::path::PathBuf; use walkdir::WalkDir; /// Formats a save file path @@ -15,15 +15,15 @@ use walkdir::WalkDir; /// /// See agcj.rs for an example of this optional suffix being used pub fn get_save_file_path( - binary_path: &str, - output_path: &String, + binary_path: &PathBuf, + output_path: &PathBuf, optional_suffix: Option, -) -> String { +) -> PathBuf { debug!( "Building Filepath - Binary Path: {:?} Output Path: {:?}", binary_path, output_path ); - let file_name = Path::new(binary_path) + let file_name = binary_path .file_stem() .unwrap() .to_string_lossy() @@ -32,18 +32,24 @@ pub fn get_save_file_path( if optional_suffix.is_none() { let full_output_path = format!( "{}/{}", - output_path.strip_suffix('/').unwrap_or(output_path), + output_path + .to_string_lossy() + .strip_suffix('/') + .unwrap_or(output_path.as_os_str().to_str().unwrap()), file_name ); - full_output_path + PathBuf::from(full_output_path) } else { let full_output_path = format!( "{}/{}-{}", - output_path.strip_suffix('/').unwrap_or(output_path), + output_path + .to_string_lossy() + .strip_suffix('/') + .unwrap_or(output_path.as_os_str().to_str().unwrap()), file_name, optional_suffix.unwrap() ); - full_output_path + PathBuf::from(full_output_path) } } @@ -53,7 +59,7 @@ pub fn get_save_file_path( /// files present within identifying files ending in .json before /// returning a Vec where each string is an absolute path /// to a given JSON file -pub fn get_json_paths_from_dir(path: &String, identifier: Option) -> Vec { +pub fn get_json_paths_from_dir(path: &PathBuf, identifier: Option) -> Vec { let mut str_vec: Vec = Vec::new(); let pattern = if identifier.is_none() { ".json".to_string() @@ -72,8 +78,8 @@ pub fn get_json_paths_from_dir(path: &String, identifier: Option) -> Vec } /// Checks to see if a directory is prsent, if not creates -pub fn check_or_create_dir(full_output_path: &String) { - if 
!Path::new(full_output_path).is_dir() { +pub fn check_or_create_dir(full_output_path: &PathBuf) { + if !full_output_path.is_dir() { create_dir_all(full_output_path).expect("Unable to create directory!"); } } @@ -89,23 +95,23 @@ mod tests { // TESTS FOR SAVE PATH BUILDING #[test] fn test_get_save_file_path_1() { - let path: &str = "test_bin/hello.json"; - let output_path: String = String::from("processed_data/"); + let path: &PathBuf = &PathBuf::from("test_bin/hello.json"); + let output_path: &PathBuf = &PathBuf::from("processed_data/"); let output_path = get_save_file_path(path, &output_path, Some("cg".to_string())); - assert_eq!(output_path, String::from("processed_data/hello-cg")) + assert_eq!(output_path, PathBuf::from("processed_data/hello-cg")) } #[test] fn test_get_save_file_path_2() { - let path: &str = "test_bin/extra_dir/hello.json"; - let output_path: String = String::from("with_more/processed_data/"); - let output = get_save_file_path(path, &output_path, None); - assert_eq!(output, String::from("with_more/processed_data/hello")) + let path: &PathBuf = &PathBuf::from("test_bin/extra_dir/hello.json"); + let output_path: &PathBuf = &PathBuf::from("with_more/processed_data/"); + let output = get_save_file_path(path, output_path, None); + assert_eq!(output, PathBuf::from("with_more/processed_data/hello")) } #[test] fn test_get_save_file_path_3() { - let path: &str = "hello.json"; - let output_path: String = String::from("processed_data"); + let path: &PathBuf = &PathBuf::from("hello.json"); + let output_path: &PathBuf = &PathBuf::from("processed_data"); let output = get_save_file_path(path, &output_path, None); - assert_eq!(output, String::from("processed_data/hello")) + assert_eq!(output, PathBuf::from("processed_data/hello")) } } From 0b0f1a1e091414395be544e30620d0eab07ccaef Mon Sep 17 00:00:00 2001 From: Br0kej Date: Mon, 29 Jan 2024 13:19:21 +0000 Subject: [PATCH 05/40] get path type refactor to have implicit return --- src/extract.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/extract.rs b/src/extract.rs index f52253b..c254cc2 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -218,15 +218,13 @@ impl ExtractionJob { ) -> Result { fn get_path_type(bin_path: &PathBuf) -> PathType { let fpath_md = fs::metadata(bin_path).unwrap(); - let p_type: PathType; if fpath_md.is_file() { - p_type = PathType::File; + PathType::File } else if fpath_md.is_dir() { - p_type = PathType::Dir; + PathType::Dir } else { - p_type = PathType::Unk; + PathType::Unk } - p_type } // This functionality is currently not being used! 
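A natural follow-on to the implicit-return refactor above: `fs::metadata(bin_path).unwrap()` still panics when the path cannot be stat'd, the same class of abrupt exit that the next two patches start to address for the r2 extraction calls. A sketch of a panic-free variant over the same `PathType` enum (not part of this patch series):

    fn get_path_type(bin_path: &Path) -> PathType {
        match fs::metadata(bin_path) {
            Ok(md) if md.is_file() => PathType::File,
            Ok(md) if md.is_dir() => PathType::Dir,
            _ => PathType::Unk,
        }
    }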
From 4a45ba5f8198f55df1325c65dbc9e6d1408f4b16 Mon Sep 17 00:00:00 2001
From: Br0kej
Date: Mon, 29 Jan 2024 13:24:15 +0000
Subject: [PATCH 06/40] fixing result panic

---
 src/files.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/files.rs b/src/files.rs
index e8b8150..a2a8593 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -51,18 +51,18 @@ impl AGFJFile {
     pub fn load_and_deserialize(&mut self) -> Result<(), ()> {
         let data = read_to_string(&self.filename).expect("Unable to read file");
 
-        #[allow(clippy::expect_fun_call)]
         // Kept in to ensure that the JSON decode error message is printed alongside the filename
-        let json: Vec<Vec<AGFJFunc>> = serde_json::from_str(&data).expect(&format!(
-            "Unable to load function data from {:?}",
-            self.filename
-        ));
+        let json = serde_json::from_str(&data);
 
-        self.functions = Some(json);
+        if json.is_ok() {
+            self.functions = Some(json.unwrap());
 
-        self.architecture = self.detect_architecture();
+            self.architecture = self.detect_architecture();
 
-        Ok(())
+            Ok(())
+        } else {
+            Err(())
+        }
     }
 
     /// Detects the architecture of a file by iterating through the functions

From 334bb47d6b10cc7118e14d167141bb3324c869b2 Mon Sep 17 00:00:00 2001
From: Br0kej
Date: Mon, 29 Jan 2024 21:01:27 +0000
Subject: [PATCH 07/40] adding graceful panic handling so rayon par_iter isn't exited

---
 Cargo.toml     |   2 +-
 src/afij.rs    |  16 +++---
 src/extract.rs | 150 +++++++++++++++++++++++++++++++------------------
 3 files changed, 103 insertions(+), 65 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index f6cc545..47e6e7e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "bin2ml"
-version = "0.2.6"
+version = "0.2.7"
 edition = "2021"
 
 [dependencies]
diff --git a/src/afij.rs b/src/afij.rs
index 18c6e43..58d2bdc 100644
--- a/src/afij.rs
+++ b/src/afij.rs
@@ -5,26 +5,26 @@ use serde_json::Value;
 #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct AFIJFunctionInfo {
-    pub offset: i64,
+    pub offset: u64,
     pub name: String,
     pub size: i128,
     #[serde(rename = "is-pure")]
     pub is_pure: String,
-    pub realsz: i64,
+    pub realsz: u64,
     pub noreturn: bool,
-    pub stackframe: i64,
+    pub stackframe: u64,
     pub calltype: String,
-    pub cost: i64,
-    pub cc: i64,
-    pub bits: i64,
+    pub cost: u64,
+    pub cc: u64,
+    pub bits: u64,
     #[serde(rename = "type")]
     pub type_field: String,
-    pub nbbs: i64,
+    pub nbbs: u64,
     #[serde(rename = "is-lineal")]
     pub is_lineal: bool,
     pub ninstrs: i64,
     pub edges: i64,
-    pub ebbs: i64,
+    pub ebbs: u64,
     pub signature: String,
     pub minbound: u64,
     pub maxbound: i128,
diff --git a/src/extract.rs b/src/extract.rs
index c254cc2..82ae08d 100644
--- a/src/extract.rs
+++ b/src/extract.rs
@@ -308,22 +308,30 @@ impl FileToBeProcessed {
         info!("Starting register behaviour extraction");
         let mut r2p = self.setup_r2_pipe(&self.file_path, debug);
         let function_details = self.get_function_name_list(&mut r2p);
-        let mut register_behaviour_vec: HashMap<String, AEAFJRegisterBehaviour> = HashMap::new();
-        info!("Executing aeafj for each function");
-        for function in function_details.iter() {
-            r2p.cmd(format!("s @ {}", &function.name).as_str())
-                .expect("Command failed..");
-            let json = r2p.cmd("aeafj").expect("Command failed..");
-            let json_obj: AEAFJRegisterBehaviour =
-                serde_json::from_str(&json).expect("Unable to convert to JSON object!");
-            register_behaviour_vec.insert(function.name.clone(), json_obj);
-        }
-        info!("All functions processed");
-        r2p.close();
-        info!("r2p closed");
+        if function_details.is_ok() {
+            let mut register_behaviour_vec: HashMap<String, AEAFJRegisterBehaviour> =
+                HashMap::new();
+            info!("Executing aeafj for each function");
+            for function in function_details.unwrap().iter() {
+                r2p.cmd(format!("s @ {}", &function.name).as_str())
+                    .expect("Command failed..");
+                let json = r2p.cmd("aeafj").expect("Command failed..");
+                let json_obj: AEAFJRegisterBehaviour =
+                    serde_json::from_str(&json).expect("Unable to convert to JSON object!");
+                register_behaviour_vec.insert(function.name.clone(), json_obj);
+            }
+            info!("All functions processed");
+            r2p.close();
+            info!("r2p closed");
 
-        info!("Writing extracted data to file");
-        self.write_to_json(&json!(register_behaviour_vec))
+            info!("Writing extracted data to file");
+            self.write_to_json(&json!(register_behaviour_vec))
+        } else {
+            error!(
+                "Failed to extract function register - Error in r2 extraction for {:?}",
+                self.file_path
+            )
+        }
     }
 
     // TODO: Refactor this so it uses the AGFJ struct
@@ -396,16 +404,23 @@ impl FileToBeProcessed {
         let function_details = self.get_function_name_list(&mut r2p);
         let mut function_xrefs: HashMap<String, Vec<FunctionXrefDetails>> = HashMap::new();
         info!("Extracting xrefs for each function");
-        for function in function_details.iter() {
-            let ret = self.get_function_xref_details(function.offset, &mut r2p);
-            function_xrefs.insert(function.name.clone(), ret);
-        }
-        info!("All functions processed");
-        r2p.close();
-        info!("r2p closed");
+        if function_details.is_ok() {
+            for function in function_details.unwrap().iter() {
+                let ret = self.get_function_xref_details(function.offset, &mut r2p);
+                function_xrefs.insert(function.name.clone(), ret);
+            }
+            info!("All functions processed");
+            r2p.close();
+            info!("r2p closed");
 
-        info!("Writing extracted data to file");
-        self.write_to_json(&json!(function_xrefs))
+            info!("Writing extracted data to file");
+            self.write_to_json(&json!(function_xrefs))
+        } else {
+            error!(
+                "Failed to extract function xrefs - Error in r2 extraction for {:?}",
+                self.file_path
+            )
+        }
     }
 
     pub fn extract_function_info(&self, debug: &bool) {
@@ -421,38 +436,44 @@ impl FileToBeProcessed {
         let f_name = format!("{:?}/{}.json", self.output_path, fp_filename);
         if !Path::new(&f_name).exists() {
             let mut r2p = self.setup_r2_pipe(&self.file_path, debug);
-            let function_details = self.get_function_name_list(&mut r2p);
-            let mut function_info: Vec<Vec<AFIJFunctionInfo>> = Vec::new();
-            info!("Extracting function metadata");
-            for function in function_details.iter() {
-                debug!("Processing {}", function.name);
-                let ret = self.get_function_info(function.offset, &mut r2p);
-                debug!("Metadata Collected: {:?}", ret);
-                function_info.push(ret);
-            }
-            info!("All functions processed");
-            r2p.close();
-            info!("r2p closed");
-            info!("Writing extracted data to file");
-            self.write_to_json(&json!(function_info
-                .into_iter()
-                .flatten()
-                .collect::<Vec<AFIJFunctionInfo>>()))
+            let function_details: Result<Vec<AFLJFuncDetails>, r2pipe::Error> =
+                self.get_function_name_list(&mut r2p);
+
+            if function_details.is_err() {
+                error!("Unable to extract function info for {:?}", self.file_path);
+                r2p.close();
+                info!("r2p closed");
+            } else {
+                r2p.close();
+                info!("r2p closed");
+
+                info!("Writing extracted data to file");
+                self.write_to_json(&json!(function_details.unwrap()))
+            }
         }
     }
 
     // r2 commands to structs
 
-    fn get_function_name_list(&self, r2p: &mut R2Pipe) -> Vec<AFLJFuncDetails> {
+    fn get_function_name_list(
+        &self,
+        r2p: &mut R2Pipe,
+    ) -> Result<Vec<AFLJFuncDetails>, r2pipe::Error> {
         info!("Getting function information from binary");
-        let json = r2p
-            .cmd("aflj")
-            .expect(&format!("aflj command failed for {:?}", self.file_path));
-        let json_obj: Vec<AFLJFuncDetails> = serde_json::from_str(&json)
-            .expect(&format!("Unable to convert to JSON object! - {}", json));
+        let json = r2p.cmd("aflj");
 
-        json_obj
+        if json.is_ok() {
+            let json_obj: Vec<AFLJFuncDetails> = serde_json::from_str(&json.as_ref().unwrap())
+                .expect(&format!(
+                    "Unable to convert to JSON object! - {}",
+                    json.unwrap()
+                ));
+
+            Ok(json_obj)
+        } else {
+            Err(json.unwrap_err())
+        }
     }
 
     fn get_function_xref_details(
@@ -482,12 +503,24 @@ impl FileToBeProcessed {
         json_obj
     }
 
-    fn get_function_info(&self, function_addr: u64, r2p: &mut R2Pipe) -> Vec<AFIJFunctionInfo> {
+    fn get_function_info(
+        &self,
+        function_addr: u64,
+        r2p: &mut R2Pipe,
+    ) -> Result<Vec<AFIJFunctionInfo>, r2pipe::Error> {
         Self::go_to_address(r2p, function_addr);
-        let json = r2p.cmd("afij").expect("afij command failed");
-        let json_obj: Vec<AFIJFunctionInfo> = serde_json::from_str(&json)
-            .expect(&format!("Unable to convert to JSON object! - {}", json));
-        json_obj
+        let json = r2p.cmd("afij");
+        if json.is_ok() {
+            let json_obj: Vec<AFIJFunctionInfo> = serde_json::from_str(&json.as_ref().unwrap())
+                .expect(&format!(
+                    "Unable to convert to JSON object! - {}",
+                    json.unwrap()
+                ));
+
+            Ok(json_obj)
+        } else {
+            Err(json.unwrap_err())
+        }
+    }
 
     // Helper Functions
 
     fn write_to_json(&self, json_obj: &Value) {
@@ -501,12 +534,17 @@ impl FileToBeProcessed {
             .to_string();
 
         fp_filename = fp_filename + "_" + &self.job_type_suffix.clone();
-        let f_name = format!("{:?}/{}.json", self.output_path, fp_filename);
+
+        let mut output_filepath = PathBuf::new();
+        output_filepath.push(self.output_path.clone());
+        output_filepath.push(fp_filename);
+        debug!("Save filename: {:?}", output_filepath);
+
         serde_json::to_writer(
-            &File::create(&f_name).expect("Unable to create file!"),
+            &File::create(&output_filepath).expect("Unable to create file!"),
             &json_obj,
         )
-        .unwrap_or_else(|_| panic!("the world is ending: {}", f_name));
+        .unwrap_or_else(|_| panic!("the world is ending: {:?}", output_filepath));
     }
 
     fn go_to_address(r2p: &mut R2Pipe, function_addr: u64) {

From 18f11742c2e905aafb7c7cc581a2050f5db5e001 Mon Sep 17 00:00:00 2001
From: Br0kej
Date: Sat, 3 Feb 2024 19:01:54 +0000
Subject: [PATCH 08/40] fixing condition to be correct

---
 src/main.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main.rs b/src/main.rs
index be8a36c..d4966ab 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -397,7 +397,7 @@ fn main() {
             warn!("The 'with_features' toggle is set but is not support for CFG generation. 
Will ignore.") }; - if path.exists() { + if !path.exists() { error!("{:?} does not exist!", path); exit(1) } From 93586f760f2d4b16b1f544a5858e569b8ad41e44 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Sun, 4 Feb 2024 19:02:08 +0000 Subject: [PATCH 09/40] fixing saving filepath formats --- src/agcj.rs | 14 +++++++------- src/extract.rs | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/agcj.rs b/src/agcj.rs index ddab388..e6fbb2a 100644 --- a/src/agcj.rs +++ b/src/agcj.rs @@ -93,10 +93,10 @@ impl AGCJFunctionCallGraphs { networkx_graph: NetworkxDiGraph, type_suffix: &str, ) { - let full_output_path = + let mut full_output_path = get_save_file_path(binary_name, output_path, Some(type_suffix.to_string())); check_or_create_dir(&full_output_path); - + debug!("Built Path: {:?}", full_output_path); let mut function_name = self.name.clone(); // This is a pretty dirty fix and may break things @@ -104,13 +104,13 @@ impl AGCJFunctionCallGraphs { function_name = self.name[..75].to_string(); } - let filename = format!( - "{:?}/{}-{}.json", - full_output_path, function_name, type_suffix - ); + let filename = format!("{}-{}.json", function_name, type_suffix); + full_output_path.push(filename); + + debug!("Attempting to save to {:?}", full_output_path); serde_json::to_writer( - &File::create(filename).expect("Failed to create writer"), + &File::create(full_output_path).expect("Failed to create writer"), &networkx_graph, ) .expect("Unable to write JSON"); diff --git a/src/extract.rs b/src/extract.rs index 82ae08d..2c7a47a 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -533,7 +533,7 @@ impl FileToBeProcessed { .to_string_lossy() .to_string(); - fp_filename = fp_filename + "_" + &self.job_type_suffix.clone(); + fp_filename = fp_filename + "_" + &self.job_type_suffix.clone() + ".json"; let mut output_filepath = PathBuf::new(); output_filepath.push(self.output_path.clone()); From ae6d4c1a71e1376984b6cc301141fe65bd7c9d9d Mon Sep 17 00:00:00 2001 From: Br0kej Date: Tue, 6 Feb 2024 20:06:02 +0000 Subject: [PATCH 10/40] refactor to support pathbufs properly and add inplace dedup method for cgs --- src/dedup.rs | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++-- src/main.rs | 11 ++++++++- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/dedup.rs b/src/dedup.rs index b06eb48..feff145 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -303,7 +303,7 @@ impl CGCorpus { filepath_format: &String, node_type: CallGraphNodeFeatureType, ) -> Result { - if output_path.exists() { + if !output_path.exists() { fs::create_dir(output_path).expect("Failed to create output directory!"); info!("Output path not found - Creating {:?}", output_path) } @@ -359,6 +359,26 @@ impl CGCorpus { } } + fn dedup_corpus_inplace(data: &mut Vec>, filepaths: &mut Vec) { + info!("Deduplicating inplace!"); + + let mut seen = HashSet::new(); + for (i, data_ele) in data.iter().enumerate() { + let hash_value = Self::calculate_hash(&data_ele); + + if seen.contains(&hash_value) { + let ret = fs::remove_file(&filepaths[i]); + if ret.is_ok() { + debug!("Sucessfully removed {:?}", ret.unwrap()) + } else { + error!("Unable to remove - {:?}", ret); + } + } else { + seen.insert(hash_value); + } + } + } + fn get_binary_name_cisco(filepath: &PathBuf) -> PathBuf { // Example: x86-gcc-9-O3_nping_cg-onehopcgcallers-meta let binary_intermediate = Path::new(filepath).parent().unwrap().file_name().unwrap(); @@ -460,6 +480,25 @@ impl CGCorpus { }); } + pub fn process_corpus_inplace(&self) { + let fp_binaries = 
self.extract_binary_from_fps(); + + // Generate binary specific filepath vectors + let mut unique_binaries_fps = self.get_unique_binary_fps(fp_binaries); + + info!("Loading the filepaths"); + unique_binaries_fps + .par_iter_mut() + .progress() + .enumerate() + .for_each(|(idx, fp_subset)| { + let mut subset_loaded_data: Vec> = + self.load_subset(fp_subset); + debug!("Starting to deduplicate the corpus - {}", idx); + Self::dedup_corpus_inplace(&mut subset_loaded_data, fp_subset); + }); + } + fn generate_dedup_filepath(output_path: &PathBuf, filepath: &PathBuf) -> PathBuf { let first_two = filepath.components().rev().take(2).collect::>(); let first_two: PathBuf = first_two.iter().rev().collect(); @@ -522,12 +561,18 @@ mod tests { "cisco".to_string() ); + // clean up + if corpus.unwrap().output_path.is_dir() { + fs::remove_dir_all(&corpus.unwrap().output_path).expect("Unable to remove directory!"); + }; + let corpus = CGCorpus::new( &PathBuf::from("test-files/cg_dedup/to_dedup"), &PathBuf::from("test-files/cg_dedup/deduped/"), &"cisco".to_string(), CallGraphNodeFeatureType::CGName, ); + assert_eq!(corpus.as_ref().unwrap().filepaths.len(), 12); assert_eq!( corpus.as_ref().unwrap().output_path, @@ -537,6 +582,10 @@ mod tests { corpus.as_ref().unwrap().filepath_format, "cisco".to_string() ); + // clean up + if corpus.unwrap().output_path.is_dir() { + fs::remove_dir_all(&corpus.unwrap().output_path).expect("Unable to remove directory!"); + } } #[test] @@ -566,7 +615,11 @@ mod tests { PathBuf::from("testbin2"), PathBuf::from("testbin2"), ] - ) + ); + // clean up + if corpus.unwrap().output_path.is_dir() { + fs::remove_dir_all(&corpus.unwrap().output_path).expect("Unable to remove directory!"); + } } #[test] @@ -578,12 +631,18 @@ mod tests { CallGraphNodeFeatureType::CGMeta, ) .unwrap(); + let fp_binaries = corpus.extract_binary_from_fps(); let unique_binary_fps = corpus.get_unique_binary_fps(fp_binaries); assert_eq!(unique_binary_fps.len(), 2); assert_eq!(unique_binary_fps[0].len(), 8); assert_eq!(unique_binary_fps[1].len(), 4); + + // clean up + if corpus.output_path.is_dir() { + fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!"); + } } #[test] @@ -604,6 +663,11 @@ mod tests { assert_eq!(subset_loaded.len(), 8); subset_loaded.retain(|c| c.is_some()); assert_eq!(subset_loaded.len(), 8); + + // clean up + if corpus.output_path.is_dir() { + fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!"); + } } #[test] diff --git a/src/main.rs b/src/main.rs index d4966ab..a553fa7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -314,6 +314,10 @@ enum DedupSubCommands { #[arg(long,value_parser = clap::builder::PossibleValuesParser::new(["cgmeta", "cgname", "tiknib"]) .map(|s| s.parse::().unwrap()), required = true)] node_feature_type: String, + + /// Toggle to remove inplace (i.e delete duplicates) + #[arg(long)] + inplace: bool, }, /// De-dup generate ESIL strings Esil { @@ -988,6 +992,7 @@ fn main() { num_threads, filepath_format, node_feature_type, + inplace, } => { rayon::ThreadPoolBuilder::new() .num_threads(*num_threads) @@ -1001,7 +1006,11 @@ fn main() { let corpus = CGCorpus::new(filename, output_path, filepath_format, node_feature_type) .unwrap(); - corpus.process_corpus(); + if *inplace { + corpus.process_corpus_inplace(); + } else { + corpus.process_corpus(); + } } else { error!("Filename provided does not exist! 
- {:?}", filename) } From bea1004504932c799d190c29fbfa4cf5f751a17d Mon Sep 17 00:00:00 2001 From: Br0kej Date: Tue, 6 Feb 2024 20:29:44 +0000 Subject: [PATCH 11/40] add empty dir removal after dedup --- src/dedup.rs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/dedup.rs b/src/dedup.rs index feff145..066b14c 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -9,7 +9,7 @@ use serde::{Deserialize, Serialize}; use serde_json::json; use std::collections::hash_map::DefaultHasher; use std::collections::{HashMap, HashSet}; -use std::fs::{read_to_string, File}; +use std::fs::{read_dir, read_to_string, File}; use std::hash::{Hash, Hasher}; use std::path::{Path, PathBuf}; use std::string::String; @@ -497,6 +497,31 @@ impl CGCorpus { debug!("Starting to deduplicate the corpus - {}", idx); Self::dedup_corpus_inplace(&mut subset_loaded_data, fp_subset); }); + + Self::clean_up_empty_dirs(&self.output_path); + } + + fn clean_up_empty_dirs(output_path: &PathBuf) { + for dir in WalkDir::new(output_path) + .into_iter() + .filter_map(|file| file.ok()) + { + if dir.path().is_dir() { + let path = dir.path(); + let dir_ret = read_dir(path); + if dir_ret.is_ok() { + let is_empty = dir_ret.unwrap().next().is_none(); + if is_empty { + let ret = fs::remove_dir(dir.path()); + if ret.is_ok() { + debug!("Successfully removed {:?}", dir.path()); + } else { + error!("Tried to remove {:?} but failed", dir.path()); + } + }; + } + } + } } fn generate_dedup_filepath(output_path: &PathBuf, filepath: &PathBuf) -> PathBuf { From dfd5c6d8a44d4c4f385a490bc5598257fe4f9d25 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Tue, 6 Feb 2024 21:33:01 +0000 Subject: [PATCH 12/40] adding functionality to create sub chunks of large subsets whilst deduplicating! --- src/dedup.rs | 36 +++++++++++++++++++++++++++++------- src/main.rs | 1 - 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/dedup.rs b/src/dedup.rs index 066b14c..f6468c7 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -360,8 +360,6 @@ impl CGCorpus { } fn dedup_corpus_inplace(data: &mut Vec>, filepaths: &mut Vec) { - info!("Deduplicating inplace!"); - let mut seen = HashSet::new(); for (i, data_ele) in data.iter().enumerate() { let hash_value = Self::calculate_hash(&data_ele); @@ -369,7 +367,7 @@ impl CGCorpus { if seen.contains(&hash_value) { let ret = fs::remove_file(&filepaths[i]); if ret.is_ok() { - debug!("Sucessfully removed {:?}", ret.unwrap()) + debug!("Sucessfully removed graph"); } else { error!("Unable to remove - {:?}", ret); } @@ -492,10 +490,34 @@ impl CGCorpus { .progress() .enumerate() .for_each(|(idx, fp_subset)| { - let mut subset_loaded_data: Vec> = - self.load_subset(fp_subset); - debug!("Starting to deduplicate the corpus - {}", idx); - Self::dedup_corpus_inplace(&mut subset_loaded_data, fp_subset); + debug!("Subset Length: {}", fp_subset.len()); + if fp_subset.len() > 2500000 { + info!("Encountered a binary subset with more than 2.5M graphs. Chunking. 
Will have to repeat!"); + let mut chunked: Vec<_> = fp_subset + .chunks(1000000) + .map(|s| { + let mut inner_vec = Vec::new(); + for ele in s { + inner_vec.push(ele.to_owned()); + } + inner_vec + }) + .collect(); + + info!("Created {} chunks of 1M (approx.)", chunked.len()); + for (i, ele) in chunked.iter_mut().enumerate() { + info!("Processing Chunk {}", i); + let mut subset_loaded_data: Vec> = + self.load_subset(ele); + debug!("Starting to deduplicate the corpus - {}", idx); + Self::dedup_corpus_inplace(&mut subset_loaded_data, ele); + } + } else { + let mut subset_loaded_data: Vec> = + self.load_subset(fp_subset); + debug!("Starting to deduplicate the corpus - {}", idx); + Self::dedup_corpus_inplace(&mut subset_loaded_data, fp_subset); + } }); Self::clean_up_empty_dirs(&self.output_path); diff --git a/src/main.rs b/src/main.rs index a553fa7..0ad10c9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -999,7 +999,6 @@ fn main() { .build_global() .unwrap(); - warn!("This only supports the Cisco Talos Binary Sim Dataset naming convention"); if Path::new(filename).exists() { let node_feature_type = CallGraphNodeFeatureType::new(node_feature_type); info!("Starting duplication process for One Hop Call Graphs"); From c3e4dfe4fc39b81f6e9deeedbea6d525044a8afe Mon Sep 17 00:00:00 2001 From: Br0kej Date: Tue, 6 Feb 2024 21:48:49 +0000 Subject: [PATCH 13/40] upping chunk size + threshold --- src/dedup.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/dedup.rs b/src/dedup.rs index f6468c7..853fb06 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -491,10 +491,10 @@ impl CGCorpus { .enumerate() .for_each(|(idx, fp_subset)| { debug!("Subset Length: {}", fp_subset.len()); - if fp_subset.len() > 2500000 { + if fp_subset.len() > 3500000 { info!("Encountered a binary subset with more than 2.5M graphs. Chunking. 
Will have to repeat!");
                     let mut chunked: Vec<_> = fp_subset
-                        .chunks(1000000)
+                        .chunks(2000000)
                         .map(|s| {
                             let mut inner_vec = Vec::new();
                             for ele in s {
                                 inner_vec.push(ele.to_owned());
                             }
                             inner_vec
                         })
                         .collect();
 
-                    info!("Created {} chunks of 1M (approx.)", chunked.len());
+                    info!("Created {} chunks of 2M (approx.)", chunked.len());
                     for (i, ele) in chunked.iter_mut().enumerate() {
-                        info!("Processing Chunk {}", i);
                         let mut subset_loaded_data: Vec<Option<CallGraphTypes>> =
                             self.load_subset(ele);
                         debug!("Starting to deduplicate the corpus - {}", idx);

From 97a5e857a535107da8b5e18f1e54440cdcee11ad Mon Sep 17 00:00:00 2001
From: Br0kej
Date: Sun, 18 Feb 2024 10:21:36 +0000
Subject: [PATCH 14/40] handling panic when faced with malformed JSON

---
 src/dedup.rs | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/dedup.rs b/src/dedup.rs
index 853fb06..e253e87 100644
--- a/src/dedup.rs
+++ b/src/dedup.rs
@@ -436,19 +436,23 @@ impl CGCorpus {
         for ele in fp_subset.iter() {
             let data = read_to_string(ele).expect(&format!("Unable to read file - {:?}", ele));
 
-            let json = serde_json::from_str::<CallGraphTypes>(&data)
-                .expect(&format!("Unable to load function data from {:?}", ele));
-
-            let nodes_empty = match self.node_type {
-                CallGraphNodeFeatureType::CGName => json.as_cg_name().unwrap().nodes.is_empty(),
-                CallGraphNodeFeatureType::CGMeta => json.as_cg_meta().unwrap().nodes.is_empty(),
-                CallGraphNodeFeatureType::TikNib => json.as_tik_nib().unwrap().nodes.is_empty(),
-            };
+            let json = serde_json::from_str::<CallGraphTypes>(&data);
+
+            if json.is_ok() {
+                let json = json.unwrap();
+                let nodes_empty = match self.node_type {
+                    CallGraphNodeFeatureType::CGName => json.as_cg_name().unwrap().nodes.is_empty(),
+                    CallGraphNodeFeatureType::CGMeta => json.as_cg_meta().unwrap().nodes.is_empty(),
+                    CallGraphNodeFeatureType::TikNib => json.as_tik_nib().unwrap().nodes.is_empty(),
+                };
 
-            if !nodes_empty {
-                subset_loaded_data.push(Some(json))
+                if !nodes_empty {
+                    subset_loaded_data.push(Some(json))
+                } else {
+                    subset_loaded_data.push(None)
+                }
             } else {
-                subset_loaded_data.push(None)
+                error!("Unable to load {:?}", ele);
             }
         }
         subset_loaded_data

From 8bdff687ad2610f284939f61c7714d5d88b87a34 Mon Sep 17 00:00:00 2001
From: Br0kej
Date: Sun, 18 Feb 2024 21:16:00 +0000
Subject: [PATCH 15/40] adding support for extended versions of the finfo subset

---
 src/afij.rs  | 32 ++++++++++++++++++++++++++++++++
 src/files.rs | 33 ++++++++++++++++++++-----------
 src/main.rs  | 10 +++++++---
 3 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/src/afij.rs b/src/afij.rs
index 58d2bdc..eb43764 100644
--- a/src/afij.rs
+++ b/src/afij.rs
@@ -1,3 +1,4 @@
+use ordered_float::OrderedFloat;
 use serde::{Deserialize, Serialize};
 use serde_aux::prelude::*;
 use serde_json::Value;
@@ -125,3 +126,34 @@ impl From<&AFIJFunctionInfo> for AFIJFeatureSubset {
         }
     }
 }
+
+#[derive(Default, Debug, Clone, PartialEq, Hash, Serialize, Deserialize)]
+pub struct AFIJFeatureSubsetExtended {
+    pub name: String,
+    pub ninstrs: i64,
+    pub edges: i64,
+    pub indegree: i64,
+    pub outdegree: i64,
+    pub nlocals: i64,
+    pub nargs: i64,
+    pub nbbs: u64,
+    pub avg_ins_bb: OrderedFloat<f32>,
+}
+
+impl From<&AFIJFunctionInfo> for AFIJFeatureSubsetExtended {
+    fn from(src: &AFIJFunctionInfo) -> AFIJFeatureSubsetExtended {
+        let avg_ins_bbs = OrderedFloat::from(src.ninstrs as f32 / src.nbbs as f32);
+
+        AFIJFeatureSubsetExtended {
+            name: src.name.clone(),
+            ninstrs: src.ninstrs,
+            edges: src.edges,
+            indegree: src.indegree.unwrap_or(0),
+            outdegree: src.outdegree.unwrap_or(0),
+            nlocals: src.nlocals.unwrap_or(0),
+            nargs: src.nargs.unwrap_or(0),
+            nbbs: src.nbbs,
+            avg_ins_bb: avg_ins_bbs,
+        }
+    }
+}
diff --git a/src/files.rs b/src/files.rs
index a2a8593..ce98a7e 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -1,4 +1,4 @@
-use crate::afij::{AFIJFeatureSubset, AFIJFunctionInfo};
+use crate::afij::{AFIJFeatureSubset, AFIJFeatureSubsetExtended, AFIJFunctionInfo};
 use crate::agcj::AGCJFunctionCallGraphs;
 use crate::agfj::{AGFJFunc, TikNibFunc};
 use crate::bb::{FeatureType, InstructionMode};
@@ -371,6 +371,7 @@ impl AGFJFile {
 #[serde(untagged)]
 pub enum FunctionMetadataTypes {
     AFIJ(Vec<AFIJFeatureSubset>),
+    AFIJExtended(Vec<AFIJFeatureSubsetExtended>),
     AGFJ(Vec<TikNibFunc>),
 }
 
@@ -415,19 +416,29 @@ impl AFIJFile {
         Ok(())
     }
 
-    pub fn subset(&mut self) -> FunctionMetadataTypes {
-        let mut func_info_subsets: Vec<AFIJFeatureSubset> = Vec::new();
-        debug!("Starting to subset functions");
-        for function in self.function_info.as_ref().unwrap().iter() {
-            let subset = AFIJFeatureSubset::from(function);
-            func_info_subsets.push(subset)
+    pub fn subset(&mut self, extended: bool) -> FunctionMetadataTypes {
+        if extended {
+            let mut func_info_subsets_extended: Vec<AFIJFeatureSubsetExtended> = Vec::new();
+            debug!("Starting to subset functions");
+            for function in self.function_info.as_ref().unwrap().iter() {
+                let subset = AFIJFeatureSubsetExtended::from(function);
+                func_info_subsets_extended.push(subset)
+            }
+            FunctionMetadataTypes::AFIJExtended(func_info_subsets_extended)
+        } else {
+            let mut func_info_subsets: Vec<AFIJFeatureSubset> = Vec::new();
+            debug!("Starting to subset functions");
+            for function in self.function_info.as_ref().unwrap().iter() {
+                let subset = AFIJFeatureSubset::from(function);
+                func_info_subsets.push(subset)
+            }
+            FunctionMetadataTypes::AFIJ(func_info_subsets)
         }
-        FunctionMetadataTypes::AFIJ(func_info_subsets)
     }
 
-    pub fn subset_and_save(&mut self) {
-        let func_info_subsets = self.subset();
+    pub fn subset_and_save(&mut self, extended: bool) {
+        let func_info_subsets = self.subset(extended);
         let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None);
-        let filename = format!("{:?}-finfo-subset.json", fname_string);
+        let filename = format!("{}-finfo-subset.json", fname_string.to_string_lossy());
         serde_json::to_writer(
             &File::create(filename).expect("Failed to create writer"),
             &func_info_subsets,
diff --git a/src/main.rs b/src/main.rs
index 0ad10c9..6ccbb00 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -202,6 +202,9 @@ enum GenerateSubCommands {
         #[arg(short, long, value_parser = clap::builder::PossibleValuesParser::new(["finfo", "agfj"]) .map(|s| s.parse::<String>().unwrap()))]
         data_source_type: String,
+        /// Toggle for extended version of finfo
+        #[arg(short, long)]
+        extended: bool,
     },
     /// Generate tokenisers from extracted data
     Tokeniser {
@@ -494,7 +497,7 @@ fn main() {
                 metadata
                     .load_and_deserialize()
                     .expect("Unable to load file");
-                let metadata_subset = metadata.subset();
+                let metadata_subset = metadata.subset(false);
                 AGCJFile {
                     filename: (*path).clone(),
                     function_call_graphs: None,
@@ -688,7 +691,7 @@ fn main() {
                     metadata_file
                         .load_and_deserialize()
                         .expect("Unable to load associated metadata file");
-                    metadata = Some(metadata_file.subset());
+                    metadata = Some(metadata_file.subset(false));
                 } else if metadata_type.clone().unwrap() == *"tiknib" {
                     let mut metadata_file = TikNibFuncMetaFile {
                         filename: PathBuf::from(metapath),
@@ -768,6 +771,7 @@ fn main() {
             input_path,
            output_path,
             data_source_type,
+            extended,
         } => {
             if data_source_type == "finfo" {
                 let mut file = AFIJFile {
@@ -779,7 +783,7 @@ fn main() {
                 file.load_and_deserialize()
.expect("Unable to load and desearilize JSON"); info!("Successfully loaded JSON"); - file.subset_and_save(); + file.subset_and_save(*extended); info!("Generation complete"); } else if data_source_type == "agfj" { warn!("This currently only supports making TikNib features for single files"); From c32f6f3cc62e385e45391b4d98afe03918023704 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Sun, 18 Feb 2024 21:20:01 +0000 Subject: [PATCH 16/40] fixing save filepath for tiknib metadata --- src/files.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/files.rs b/src/files.rs index ce98a7e..2fab1a1 100644 --- a/src/files.rs +++ b/src/files.rs @@ -336,7 +336,7 @@ impl AGFJFile { let json = json!(&func_feature_vectors); let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); - let fname_string = format!("{:?}-tiknib.json", fname_string); + let fname_string = format!("{}-tiknib.json", fname_string.to_string_lossy()); serde_json::to_writer( &File::create(fname_string).expect("Failed to create writer"), &json, From 282c8c40b4bf589a3e904af2b582f52cf1f5ce99 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Sun, 18 Feb 2024 21:21:03 +0000 Subject: [PATCH 17/40] resolving clippy warnings --- src/dedup.rs | 16 +++++++--------- src/extract.rs | 20 -------------------- 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/src/dedup.rs b/src/dedup.rs index e253e87..e06b3e0 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -509,7 +509,7 @@ impl CGCorpus { .collect(); info!("Created {} chunks of 2M (approx.)", chunked.len()); - for (i, ele) in chunked.iter_mut().enumerate() { + for (_i, ele) in chunked.iter_mut().enumerate() { let mut subset_loaded_data: Vec> = self.load_subset(ele); debug!("Starting to deduplicate the corpus - {}", idx); @@ -582,14 +582,12 @@ impl CGCorpus { } mod tests { - use crate::dedup::CGCorpus; - use crate::networkx::{ - CallGraphFuncWithMetadata, CallGraphNodeFeatureType, CallGraphTypes, NetworkxDiGraph, - }; - use std::fs; - use std::fs::read_to_string; - use std::path::{Path, PathBuf}; - use walkdir::WalkDir; + + + + + + // Test Dedup on typed CG's #[test] diff --git a/src/extract.rs b/src/extract.rs index 2c7a47a..aeaa1c1 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -503,26 +503,6 @@ impl FileToBeProcessed { json_obj } - fn get_function_info( - &self, - function_addr: u64, - r2p: &mut R2Pipe, - ) -> Result, r2pipe::Error> { - Self::go_to_address(r2p, function_addr); - let json = r2p.cmd("afij"); - if json.is_ok() { - let json_obj: Vec = serde_json::from_str(&json.as_ref().unwrap()) - .expect(&format!( - "Unable to convert to JSON object! 
- {}", - json.unwrap() - )); - - Ok(json_obj) - } else { - Err(json.unwrap_err()) - } - } - // Helper Functions fn write_to_json(&self, json_obj: &Value) { From 52473fc6032435063c4d930e45115ee35f1122a6 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Sun, 18 Feb 2024 21:40:47 +0000 Subject: [PATCH 18/40] adding support for threaded tiknib feature generation --- src/main.rs | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/src/main.rs b/src/main.rs index 6ccbb00..b98ea7e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -787,18 +787,39 @@ fn main() { info!("Generation complete"); } else if data_source_type == "agfj" { warn!("This currently only supports making TikNib features for single files"); - let mut file = AGFJFile { - functions: None, - filename: input_path.to_owned(), - output_path: output_path.to_owned(), - min_blocks: 1, // Dummy - feature_type: None, - architecture: None, - reg_norm: false, // Dummy - }; - file.load_and_deserialize().expect("Unable to load data"); - file.tiknib_func_level_feature_gen() + if input_path.is_file() { + let mut file = AGFJFile { + functions: None, + filename: input_path.to_owned(), + output_path: output_path.to_owned(), + min_blocks: 1, // Dummy + feature_type: None, + architecture: None, + reg_norm: false, // Dummy + }; + + file.load_and_deserialize().expect("Unable to load data"); + file.tiknib_func_level_feature_gen() + } else { + let mut file_paths_vec = + get_json_paths_from_dir(input_path, Some("_cfg".to_string())); + + file_paths_vec.par_iter().for_each(|filepath| { + let mut file = AGFJFile { + functions: None, + filename: filepath.to_owned().parse().unwrap(), + output_path: output_path.to_owned(), + min_blocks: 1, // Dummy + feature_type: None, + architecture: None, + reg_norm: false, // Dummy + }; + + file.load_and_deserialize().expect("Unable to load data"); + file.tiknib_func_level_feature_gen() + }); + } } } GenerateSubCommands::Nlp { From 84ad960b2b9886dc63dfb7ddd3554038aa47d1cb Mon Sep 17 00:00:00 2001 From: br0kej Date: Thu, 22 Feb 2024 14:15:22 +0000 Subject: [PATCH 19/40] fixing tests --- src/dedup.rs | 111 ++++++++++++++++++++++++++------------------------- 1 file changed, 57 insertions(+), 54 deletions(-) diff --git a/src/dedup.rs b/src/dedup.rs index e253e87..ec3e671 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -601,40 +601,40 @@ mod tests { &"cisco".to_string(), CallGraphNodeFeatureType::CGName, ); - assert_eq!(corpus.as_ref().unwrap().filepaths.len(), 12); - assert_eq!( - corpus.as_ref().unwrap().output_path, - PathBuf::from("test-files/cg_dedup/deduped/") - ); - assert_eq!( - corpus.as_ref().unwrap().filepath_format, - "cisco".to_string() - ); - - // clean up - if corpus.unwrap().output_path.is_dir() { - fs::remove_dir_all(&corpus.unwrap().output_path).expect("Unable to remove directory!"); - }; - let corpus = CGCorpus::new( - &PathBuf::from("test-files/cg_dedup/to_dedup"), - &PathBuf::from("test-files/cg_dedup/deduped/"), - &"cisco".to_string(), - CallGraphNodeFeatureType::CGName, - ); + if corpus.is_ok() { + let corpus = corpus.unwrap(); + assert_eq!(corpus.filepaths.len(), 12); + assert_eq!( + corpus.output_path, + PathBuf::from("test-files/cg_dedup/deduped/") + ); + assert_eq!(corpus.filepath_format, "cisco".to_string()); + + // clean up + if corpus.output_path.is_dir() { + fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!"); + }; - assert_eq!(corpus.as_ref().unwrap().filepaths.len(), 12); - assert_eq!( - corpus.as_ref().unwrap().output_path, 
- PathBuf::from("test-files/cg_dedup/deduped/") - ); - assert_eq!( - corpus.as_ref().unwrap().filepath_format, - "cisco".to_string() - ); - // clean up - if corpus.unwrap().output_path.is_dir() { - fs::remove_dir_all(&corpus.unwrap().output_path).expect("Unable to remove directory!"); + let corpus = CGCorpus::new( + &PathBuf::from("test-files/cg_dedup/to_dedup"), + &PathBuf::from("test-files/cg_dedup/deduped/"), + &"cisco".to_string(), + CallGraphNodeFeatureType::CGName, + ); + if corpus.is_ok() { + let corpus = corpus.unwrap(); + assert_eq!(corpus.filepaths.len(), 12); + assert_eq!( + corpus.output_path, + PathBuf::from("test-files/cg_dedup/deduped/") + ); + assert_eq!(corpus.filepath_format, "cisco".to_string()); + // clean up + if corpus.output_path.is_dir() { + fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!"); + } + } } } @@ -647,28 +647,31 @@ mod tests { CallGraphNodeFeatureType::CGMeta, ); - let fp_binaries = corpus.unwrap().extract_binary_from_fps(); - assert_eq!(fp_binaries.len(), 12); - assert_eq!( - fp_binaries, - vec![ - PathBuf::from("testbin"), - PathBuf::from("testbin"), - PathBuf::from("testbin"), - PathBuf::from("testbin"), - PathBuf::from("testbin"), - PathBuf::from("testbin"), - PathBuf::from("testbin"), - PathBuf::from("testbin"), - PathBuf::from("testbin2"), - PathBuf::from("testbin2"), - PathBuf::from("testbin2"), - PathBuf::from("testbin2"), - ] - ); - // clean up - if corpus.unwrap().output_path.is_dir() { - fs::remove_dir_all(&corpus.unwrap().output_path).expect("Unable to remove directory!"); + if corpus.is_ok() { + let corpus = corpus.unwrap(); + let fp_binaries = corpus.extract_binary_from_fps(); + assert_eq!(fp_binaries.len(), 12); + assert_eq!( + fp_binaries, + vec![ + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin"), + PathBuf::from("testbin2"), + PathBuf::from("testbin2"), + PathBuf::from("testbin2"), + PathBuf::from("testbin2"), + ] + ); + // clean up + if corpus.output_path.is_dir() { + fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!"); + } } } From e6ce3b2b9361e4365a1003a8076000e431f97981 Mon Sep 17 00:00:00 2001 From: br0kej Date: Thu, 22 Feb 2024 14:35:59 +0000 Subject: [PATCH 20/40] adding support for extended aaa analysis for r2 extraction --- src/extract.rs | 29 ++++++++++++++++++----------- src/main.rs | 24 ++++++++++++++---------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/src/extract.rs b/src/extract.rs index 2c7a47a..23db6a3 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -304,9 +304,9 @@ impl ExtractionJob { } impl FileToBeProcessed { - pub fn extract_register_behaviour(&self, debug: &bool) { + pub fn extract_register_behaviour(&self, debug: &bool, extended_analysis: &bool) { info!("Starting register behaviour extraction"); - let mut r2p = self.setup_r2_pipe(&self.file_path, debug); + let mut r2p = self.setup_r2_pipe(&self.file_path, debug, extended_analysis); let function_details = self.get_function_name_list(&mut r2p); if function_details.is_ok() { let mut register_behaviour_vec: HashMap = @@ -335,7 +335,7 @@ impl FileToBeProcessed { } // TODO: Refactor this so it uses the AGFJ struct - pub fn extract_func_cfgs(&self, debug: &bool) { + pub fn extract_func_cfgs(&self, debug: &bool, extended_analysis: &bool) { let mut fp_filename = Path::new(&self.file_path) .file_name() .expect("Unable to get 
filename") @@ -347,7 +347,7 @@ impl FileToBeProcessed { info!("{} not found. Continuing processing.", f_name); // This creates HUGE JSON files for each files // Approximately 40x file size to JSON - let mut r2p = self.setup_r2_pipe(&self.file_path, debug); + let mut r2p = self.setup_r2_pipe(&self.file_path, debug, extended_analysis); info!("Executing agfj @@f on {:?}", self.file_path); let mut json = r2p.cmd("agfj @@f").expect("Command failed.."); @@ -385,9 +385,9 @@ impl FileToBeProcessed { } } - pub fn extract_function_call_graphs(&self, debug: &bool) { + pub fn extract_function_call_graphs(&self, debug: &bool, extended_analysis: &bool) { info!("Starting function call graph extraction"); - let mut r2p = self.setup_r2_pipe(&self.file_path, debug); + let mut r2p = self.setup_r2_pipe(&self.file_path, debug, extended_analysis); let json = r2p.cmd("agCj").expect("agCj command failed to execute"); let function_call_graphs: Vec = serde_json::from_str(&json).expect("Unable to convert to JSON object!"); @@ -399,8 +399,8 @@ impl FileToBeProcessed { self.write_to_json(&json!(function_call_graphs)) } - pub fn extract_function_xrefs(&self, debug: &bool) { - let mut r2p = self.setup_r2_pipe(&self.file_path, debug); + pub fn extract_function_xrefs(&self, debug: &bool, extended_analysis: &bool) { + let mut r2p = self.setup_r2_pipe(&self.file_path, debug, extended_analysis); let function_details = self.get_function_name_list(&mut r2p); let mut function_xrefs: HashMap> = HashMap::new(); info!("Extracting xrefs for each function"); @@ -423,7 +423,7 @@ impl FileToBeProcessed { } } - pub fn extract_function_info(&self, debug: &bool) { + pub fn extract_function_info(&self, debug: &bool, extended_analysis: &bool) { info!("Starting function metdata extraction"); let mut fp_filename = self .file_path @@ -435,7 +435,7 @@ impl FileToBeProcessed { fp_filename = fp_filename + "_" + &self.job_type_suffix.clone(); let f_name = format!("{:?}/{}.json", self.output_path, fp_filename); if !Path::new(&f_name).exists() { - let mut r2p = self.setup_r2_pipe(&self.file_path, debug); + let mut r2p = self.setup_r2_pipe(&self.file_path, debug, extended_analysis); let function_details: Result, r2pipe::Error> = self.get_function_name_list(&mut r2p); @@ -552,7 +552,7 @@ impl FileToBeProcessed { .expect("failed to seek addr"); } - fn setup_r2_pipe(&self, s: &PathBuf, debug: &bool) -> R2Pipe { + fn setup_r2_pipe(&self, s: &PathBuf, debug: &bool, extended_analysis: &bool) -> R2Pipe { // Setup R2 pipe with options and return it // Could be extended to include toggling of options // + more args? 
@@ -576,11 +576,18 @@ impl FileToBeProcessed { R2Pipe::spawn(s.to_str().unwrap(), Some(opts)).expect("Failed to spawn new R2Pipe") } }; + if *extended_analysis { + debug!("Executing 'aaa' r2 command for {:?}", s); + r2p.cmd("aaa") + .expect("Unable to complete standard analysis!"); + debug!("'aaa' r2 command complete for {:?}", s); + } else { debug!("Executing 'aa' r2 command for {:?}", s); r2p.cmd("aa") .expect("Unable to complete standard analysis!"); debug!("'aa' r2 command complete for {:?}", s); + } r2p } } diff --git a/src/main.rs b/src/main.rs index 0ad10c9..0ef1dce 100644 --- a/src/main.rs +++ b/src/main.rs @@ -259,6 +259,9 @@ enum Commands { #[arg(long, default_value = "false")] debug: bool, + + #[arg(long, default_value = "false")] + extended_analysis: bool, }, /// Generate single embeddings on the fly /// @@ -895,6 +898,7 @@ fn main() { mode, num_threads, debug, + extended_analysis, } => { info!("Creating extraction job"); let job = ExtractionJob::new(fpath, output_dir, mode).unwrap(); @@ -915,7 +919,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_func_cfgs(debug)); + .for_each(|path| path.extract_func_cfgs(debug, extended_analysis)); } else if job.job_type == ExtractionJobType::RegisterBehaviour { info!("Extraction Job Type: Register Behaviour"); info!("Starting Parallel generation."); @@ -923,7 +927,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_register_behaviour(debug)); + .for_each(|path| path.extract_register_behaviour(debug, extended_analysis)); } else if job.job_type == ExtractionJobType::FunctionXrefs { info!("Extraction Job Type: Function Xrefs"); info!("Starting Parallel generation."); @@ -931,7 +935,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_function_xrefs(debug)); + .for_each(|path| path.extract_function_xrefs(debug, extended_analysis)); } else if job.job_type == ExtractionJobType::CallGraphs { info!("Extraction Job Type: Call Graphs"); info!("Starting Parallel generation."); @@ -939,7 +943,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_function_call_graphs(debug)); + .for_each(|path| path.extract_function_call_graphs(debug, extended_analysis)); } else if job.job_type == ExtractionJobType::FuncInfo { info!("Extraction Job Type: Function Info"); info!("Starting Parallel generation."); @@ -947,25 +951,25 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_function_info(debug)); + .for_each(|path| path.extract_function_info(debug, extended_analysis)); } } else if job.input_path_type == PathType::File { info!("Single file found"); if job.job_type == ExtractionJobType::CFG { info!("Extraction Job Type: CFG"); - job.files_to_be_processed[0].extract_func_cfgs(debug); + job.files_to_be_processed[0].extract_func_cfgs(debug, extended_analysis); } else if job.job_type == ExtractionJobType::RegisterBehaviour { info!("Extraction Job Type: Register Behaviour"); - job.files_to_be_processed[0].extract_register_behaviour(debug) + job.files_to_be_processed[0].extract_register_behaviour(debug, extended_analysis) } else if job.job_type == ExtractionJobType::FunctionXrefs { info!("Extraction Job type: Function Xrefs"); - job.files_to_be_processed[0].extract_function_xrefs(debug) + job.files_to_be_processed[0].extract_function_xrefs(debug, extended_analysis) } else if job.job_type == ExtractionJobType::CallGraphs { info!("Extraction Job 
type: Function Call Graphs");
-                job.files_to_be_processed[0].extract_function_call_graphs(debug)
+                job.files_to_be_processed[0].extract_function_call_graphs(debug, extended_analysis)
             } else if job.job_type == ExtractionJobType::FuncInfo {
                 info!("Extraction Job type: Function Info");
-                job.files_to_be_processed[0].extract_function_info(debug)
+                job.files_to_be_processed[0].extract_function_info(debug, extended_analysis)
             }
             info!("Extraction complete for {:?}", fpath)
         }

From 15d9ea850c4a94b29b829342564a6716e6eef945 Mon Sep 17 00:00:00 2001
From: br0kej
Date: Fri, 23 Feb 2024 07:52:17 +0000
Subject: [PATCH 21/40] [refactor] adding a r2 pipe config struct to simplify
 function calls and extract code in general

---
 src/extract.rs | 84 ++++++++++++++++++++++++++++++--------------------
 src/main.rs    | 22 ++++++-------
 2 files changed, 61 insertions(+), 45 deletions(-)

diff --git a/src/extract.rs b/src/extract.rs
index 23db6a3..e305126 100644
--- a/src/extract.rs
+++ b/src/extract.rs
@@ -38,6 +38,7 @@ pub struct FileToBeProcessed {
     pub file_path: PathBuf,
     pub output_path: PathBuf,
     pub job_type_suffix: String,
+    pub r2p_config: R2PipeConfig,
 }
 
 #[derive(Debug)]
@@ -49,6 +50,12 @@ pub struct ExtractionJob {
     pub output_path: PathBuf, // Remove - Kept for backwards compat
 }
 
+#[derive(Debug, Clone, Copy)]
+pub struct R2PipeConfig {
+    pub debug: bool,
+    pub extended_analysis: bool,
+}
+
 impl std::fmt::Display for ExtractionJob {
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         write!(
@@ -159,7 +166,6 @@ pub struct Codexref {
 // Structs related to AEAFJ
 #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
 pub struct AEAFJRegisterBehaviour {
     #[serde(rename = "A")]
     pub a: Vec<String>,
@@ -200,12 +206,13 @@ impl std::fmt::Display for AFLJFuncDetails {
     }
 }
 
-impl From<(String, String, String)> for FileToBeProcessed {
-    fn from(orig: (String, String, String)) -> FileToBeProcessed {
+impl From<(String, String, String, R2PipeConfig)> for FileToBeProcessed {
+    fn from(orig: (String, String, String, R2PipeConfig)) -> FileToBeProcessed {
         FileToBeProcessed {
             file_path: PathBuf::from(orig.0),
             output_path: PathBuf::from(orig.1),
             job_type_suffix: orig.2,
+            r2p_config: orig.3.clone(),
         }
     }
 }
@@ -215,6 +222,8 @@ impl ExtractionJob {
         input_path: &PathBuf,
         output_path: &PathBuf,
         mode: &str,
+        debug: &bool,
+        extended_analysis: &bool,
     ) -> Result {
         fn get_path_type(bin_path: &PathBuf) -> PathType {
             let fpath_md = fs::metadata(bin_path).unwrap();
@@ -241,6 +250,11 @@ impl ExtractionJob {
             }
         }
 
+        let r2_handle_config = R2PipeConfig {
+            debug: *debug,
+            extended_analysis: *extended_analysis,
+        };
+
         let p_type = get_path_type(input_path);
         let job_type = extraction_job_matcher(mode).unwrap();
 
@@ -249,6 +263,7 @@ impl ExtractionJob {
                 file_path: input_path.to_owned(),
                 output_path: output_path.to_owned(),
                 job_type_suffix: (*mode).to_string(),
+                r2p_config: r2_handle_config,
             };
             Ok(ExtractionJob {
                 input_path: input_path.to_owned(),
@@ -259,13 +274,15 @@ impl ExtractionJob {
             })
         } else if p_type == PathType::Dir {
             let files = ExtractionJob::get_file_paths_dir(input_path);
-            let files_with_output_path: Vec<(String, String, String)> = files
+
+            let files_with_output_path: Vec<(String, String, String, R2PipeConfig)> = files
                 .into_iter()
                 .map(|f| {
                     (
                         f,
                         output_path.to_string_lossy().to_string(),
                         mode.to_string(),
+                        r2_handle_config,
                     )
                 })
                 .collect();
@@ -304,9 +321,9 @@ impl ExtractionJob {
 }
 
 impl FileToBeProcessed {
-    pub fn extract_register_behaviour(&self, debug: &bool, 
extended_analysis: &bool) { + pub fn extract_register_behaviour(&self) { info!("Starting register behaviour extraction"); - let mut r2p = self.setup_r2_pipe(&self.file_path, debug, extended_analysis); + let mut r2p = self.setup_r2_pipe(); let function_details = self.get_function_name_list(&mut r2p); if function_details.is_ok() { let mut register_behaviour_vec: HashMap = @@ -335,7 +352,7 @@ impl FileToBeProcessed { } // TODO: Refactor this so it uses the AGFJ struct - pub fn extract_func_cfgs(&self, debug: &bool, extended_analysis: &bool) { + pub fn extract_func_cfgs(&self) { let mut fp_filename = Path::new(&self.file_path) .file_name() .expect("Unable to get filename") @@ -347,7 +364,7 @@ impl FileToBeProcessed { info!("{} not found. Continuing processing.", f_name); // This creates HUGE JSON files for each files // Approximately 40x file size to JSON - let mut r2p = self.setup_r2_pipe(&self.file_path, debug, extended_analysis); + let mut r2p = self.setup_r2_pipe(); info!("Executing agfj @@f on {:?}", self.file_path); let mut json = r2p.cmd("agfj @@f").expect("Command failed.."); @@ -385,9 +402,9 @@ impl FileToBeProcessed { } } - pub fn extract_function_call_graphs(&self, debug: &bool, extended_analysis: &bool) { + pub fn extract_function_call_graphs(&self) { info!("Starting function call graph extraction"); - let mut r2p = self.setup_r2_pipe(&self.file_path, debug, extended_analysis); + let mut r2p = self.setup_r2_pipe(); let json = r2p.cmd("agCj").expect("agCj command failed to execute"); let function_call_graphs: Vec = serde_json::from_str(&json).expect("Unable to convert to JSON object!"); @@ -399,8 +416,8 @@ impl FileToBeProcessed { self.write_to_json(&json!(function_call_graphs)) } - pub fn extract_function_xrefs(&self, debug: &bool, extended_analysis: &bool) { - let mut r2p = self.setup_r2_pipe(&self.file_path, debug, extended_analysis); + pub fn extract_function_xrefs(&self) { + let mut r2p = self.setup_r2_pipe(); let function_details = self.get_function_name_list(&mut r2p); let mut function_xrefs: HashMap> = HashMap::new(); info!("Extracting xrefs for each function"); @@ -423,7 +440,7 @@ impl FileToBeProcessed { } } - pub fn extract_function_info(&self, debug: &bool, extended_analysis: &bool) { + pub fn extract_function_info(&self) { info!("Starting function metdata extraction"); let mut fp_filename = self .file_path @@ -435,7 +452,7 @@ impl FileToBeProcessed { fp_filename = fp_filename + "_" + &self.job_type_suffix.clone(); let f_name = format!("{:?}/{}.json", self.output_path, fp_filename); if !Path::new(&f_name).exists() { - let mut r2p = self.setup_r2_pipe(&self.file_path, debug, extended_analysis); + let mut r2p = self.setup_r2_pipe(); let function_details: Result, r2pipe::Error> = self.get_function_name_list(&mut r2p); @@ -552,42 +569,41 @@ impl FileToBeProcessed { .expect("failed to seek addr"); } - fn setup_r2_pipe(&self, s: &PathBuf, debug: &bool, extended_analysis: &bool) -> R2Pipe { - // Setup R2 pipe with options and return it - // Could be extended to include toggling of options - // + more args? 
- let opts = if !(*debug) { - debug!("Creating r2 handle without debugging"); + fn setup_r2_pipe(&self) -> R2Pipe { + + let opts = if self.r2p_config.debug { + debug!("Creating r2 handle with debugging"); R2PipeSpawnOptions { exepath: "r2".to_owned(), - args: vec!["-e bin.cache=true", "-e log.level=1", "-2"], + args: vec!["-e bin.cache=true", "-e log.level=0"], } } else { - debug!("Creating r2 handle with debugging"); + debug!("Creating r2 handle without debugging"); R2PipeSpawnOptions { exepath: "r2".to_owned(), - args: vec!["-e bin.cache=true", "-e log.level=0"], + args: vec!["-e bin.cache=true", "-e log.level=1", "-2"], } }; - debug!("Attempting to create r2pipe using {:?}", s); + + debug!("Attempting to create r2pipe using {:?}", self.file_path); let mut r2p = match R2Pipe::in_session() { Some(_) => R2Pipe::open().expect("Unable to open R2Pipe"), None => { - R2Pipe::spawn(s.to_str().unwrap(), Some(opts)).expect("Failed to spawn new R2Pipe") + R2Pipe::spawn(self.file_path.to_str().unwrap(), Some(opts)).expect("Failed to spawn new R2Pipe") } }; - if *extended_analysis { - debug!("Executing 'aaa' r2 command for {:?}", s); + + if self.r2p_config.extended_analysis { + debug!("Executing 'aaa' r2 command for {}", self.file_path.display()); r2p.cmd("aaa") .expect("Unable to complete standard analysis!"); - debug!("'aaa' r2 command complete for {:?}", s); + debug!("'aaa' r2 command complete for {}", self.file_path.display()); } else { - - debug!("Executing 'aa' r2 command for {:?}", s); - r2p.cmd("aa") - .expect("Unable to complete standard analysis!"); - debug!("'aa' r2 command complete for {:?}", s); - } + debug!("Executing 'aa' r2 command for {}", self.file_path.display()); + r2p.cmd("aa") + .expect("Unable to complete standard analysis!"); + debug!("'aa' r2 command complete for {:?}", self.file_path.display()); + }; r2p } } diff --git a/src/main.rs b/src/main.rs index 0ef1dce..ae150e5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -901,7 +901,7 @@ fn main() { extended_analysis, } => { info!("Creating extraction job"); - let job = ExtractionJob::new(fpath, output_dir, mode).unwrap(); + let job = ExtractionJob::new(fpath, output_dir, mode, debug, extended_analysis).unwrap(); if job.input_path_type == PathType::Dir { info!("Directory found - will parallel process"); @@ -919,7 +919,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_func_cfgs(debug, extended_analysis)); + .for_each(|path| path.extract_func_cfgs()); } else if job.job_type == ExtractionJobType::RegisterBehaviour { info!("Extraction Job Type: Register Behaviour"); info!("Starting Parallel generation."); @@ -927,7 +927,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_register_behaviour(debug, extended_analysis)); + .for_each(|path| path.extract_register_behaviour()); } else if job.job_type == ExtractionJobType::FunctionXrefs { info!("Extraction Job Type: Function Xrefs"); info!("Starting Parallel generation."); @@ -935,7 +935,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_function_xrefs(debug, extended_analysis)); + .for_each(|path| path.extract_function_xrefs()); } else if job.job_type == ExtractionJobType::CallGraphs { info!("Extraction Job Type: Call Graphs"); info!("Starting Parallel generation."); @@ -943,7 +943,7 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_function_call_graphs(debug, extended_analysis)); + .for_each(|path| 
path.extract_function_call_graphs()); } else if job.job_type == ExtractionJobType::FuncInfo { info!("Extraction Job Type: Function Info"); info!("Starting Parallel generation."); @@ -951,25 +951,25 @@ fn main() { job.files_to_be_processed .par_iter() .progress() - .for_each(|path| path.extract_function_info(debug, extended_analysis)); + .for_each(|path| path.extract_function_info()); } } else if job.input_path_type == PathType::File { info!("Single file found"); if job.job_type == ExtractionJobType::CFG { info!("Extraction Job Type: CFG"); - job.files_to_be_processed[0].extract_func_cfgs(debug, extended_analysis); + job.files_to_be_processed[0].extract_func_cfgs(); } else if job.job_type == ExtractionJobType::RegisterBehaviour { info!("Extraction Job Type: Register Behaviour"); - job.files_to_be_processed[0].extract_register_behaviour(debug, extended_analysis) + job.files_to_be_processed[0].extract_register_behaviour() } else if job.job_type == ExtractionJobType::FunctionXrefs { info!("Extraction Job type: Function Xrefs"); - job.files_to_be_processed[0].extract_function_xrefs(debug, extended_analysis) + job.files_to_be_processed[0].extract_function_xrefs() } else if job.job_type == ExtractionJobType::CallGraphs { info!("Extraction Job type: Function Call Graphs"); - job.files_to_be_processed[0].extract_function_call_graphs(debug, extended_analysis) + job.files_to_be_processed[0].extract_function_call_graphs() } else if job.job_type == ExtractionJobType::FuncInfo { info!("Extraction Job type: Function Info"); - job.files_to_be_processed[0].extract_function_info(debug, extended_analysis) + job.files_to_be_processed[0].extract_function_info() } info!("Extraction complete for {:?}", fpath) } From 3e60af1db4283a6b43ae5618b8f6757334ca2245 Mon Sep 17 00:00:00 2001 From: br0kej Date: Fri, 23 Feb 2024 07:59:01 +0000 Subject: [PATCH 22/40] [refactor] cleaning up comments/exceptions --- src/extract.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/extract.rs b/src/extract.rs index e305126..766b5f1 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -345,13 +345,12 @@ impl FileToBeProcessed { self.write_to_json(&json!(register_behaviour_vec)) } else { error!( - "Failed to extract function register - Error in r2 extraction for {:?}", + "Failed to extract function details to generate register behaviour - Error in r2 extraction for {:?}", self.file_path ) } } - // TODO: Refactor this so it uses the AGFJ struct pub fn extract_func_cfgs(&self) { let mut fp_filename = Path::new(&self.file_path) .file_name() @@ -362,15 +361,11 @@ impl FileToBeProcessed { let f_name = format!("{:?}/{}.json", &self.output_path, fp_filename); if !Path::new(&f_name).exists() { info!("{} not found. 
Continuing processing.", f_name); - // This creates HUGE JSON files for each files - // Approximately 40x file size to JSON let mut r2p = self.setup_r2_pipe(); info!("Executing agfj @@f on {:?}", self.file_path); - let mut json = r2p.cmd("agfj @@f").expect("Command failed.."); - + let mut json = r2p.cmd("agfj @@f").expect("Failed to extract control flow graph information."); info!("Closing r2p process for {:?}", self.file_path); r2p.close(); - info!("Starting JSON fixup for {:?}", self.file_path); // Fix JSON object json = json.replace("[]\n", ","); From ace10bf3ad32c85469631c776fdcb8cf794c37ec Mon Sep 17 00:00:00 2001 From: br0kej Date: Fri, 23 Feb 2024 08:05:13 +0000 Subject: [PATCH 23/40] [tidy] remove dead code, handle errors for dedup when output folder is already present --- src/dedup.rs | 12 ++++++++---- src/extract.rs | 19 ------------------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/src/dedup.rs b/src/dedup.rs index ec3e671..84c59a9 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -304,8 +304,12 @@ impl CGCorpus { node_type: CallGraphNodeFeatureType, ) -> Result { if !output_path.exists() { - fs::create_dir(output_path).expect("Failed to create output directory!"); - info!("Output path not found - Creating {:?}", output_path) + let ret = fs::create_dir(output_path); + if ret.is_ok() { + info!("Output path not found - Creating {:?}", output_path) + } else { + info!("Output path {:?} found", output_path) + } } let mut filepaths: Vec = Vec::new(); @@ -325,7 +329,7 @@ impl CGCorpus { Ok(CGCorpus { filepaths, - output_path: output_path, + output_path, filepath_format: filepath_format.to_string(), node_type, }) @@ -512,7 +516,7 @@ impl CGCorpus { for (i, ele) in chunked.iter_mut().enumerate() { let mut subset_loaded_data: Vec> = self.load_subset(ele); - debug!("Starting to deduplicate the corpus - {}", idx); + debug!("Starting to deduplicate chunk {} for corpus {}", i, idx); Self::dedup_corpus_inplace(&mut subset_loaded_data, ele); } } else { diff --git a/src/extract.rs b/src/extract.rs index 766b5f1..5ca3ddc 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -515,25 +515,6 @@ impl FileToBeProcessed { json_obj } - fn get_function_info( - &self, - function_addr: u64, - r2p: &mut R2Pipe, - ) -> Result, r2pipe::Error> { - Self::go_to_address(r2p, function_addr); - let json = r2p.cmd("afij"); - if json.is_ok() { - let json_obj: Vec = serde_json::from_str(&json.as_ref().unwrap()) - .expect(&format!( - "Unable to convert to JSON object! 
- {}", - json.unwrap() - )); - - Ok(json_obj) - } else { - Err(json.unwrap_err()) - } - } // Helper Functions From db0b73d57811ac6f2eca6cc923909a0bd3abd5ca Mon Sep 17 00:00:00 2001 From: Br0kej Date: Sat, 24 Feb 2024 17:19:00 +0000 Subject: [PATCH 24/40] adding initial struct for extended node info --- src/combos.rs | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 src/combos.rs diff --git a/src/combos.rs b/src/combos.rs new file mode 100644 index 0000000..f10b5dc --- /dev/null +++ b/src/combos.rs @@ -0,0 +1,54 @@ +use crate::afij::AFIJFeatureSubset; +use crate::agfj::TikNibFuncFeatures; +use crate::files::FunctionMetadataTypes; +use ordered_float::OrderedFloat; +pub struct FinfoTiknib { + pub name: String, + pub edges: i64, + pub indegree: i64, + pub outdegree: i64, + pub nlocals: i64, + pub nargs: i64, + pub avg_arithshift: OrderedFloat, + pub avg_compare: OrderedFloat, + pub avg_ctransfer: OrderedFloat, + pub avg_ctransfercond: OrderedFloat, + pub avg_dtransfer: OrderedFloat, + pub avg_float: OrderedFloat, + pub avg_total: OrderedFloat, + // Sum + pub sum_arithshift: OrderedFloat, + pub sum_compare: OrderedFloat, + pub sum_ctransfer: OrderedFloat, + pub sum_ctransfercond: OrderedFloat, + pub sum_dtransfer: OrderedFloat, + pub sum_float: OrderedFloat, + pub sum_total: OrderedFloat, +} + +impl From<(AFIJFeatureSubset, TikNibFuncFeatures)> for FinfoTiknib { + fn from(value: (AFIJFeatureSubset, TikNibFuncFeatures)) -> Self { + FinfoTiknib { + name: value.0.name, + edges: value.0.edges, + indegree: value.0.indegree, + outdegree: value.0.outdegree, + nlocals: value.0.nlocals, + nargs: value.0.nargs, + avg_arithshift: value.1.avg_arithshift, + avg_compare: value.1.avg_compare, + avg_ctransfer: value.1.avg_ctransfer, + avg_ctransfercond: value.1.avg_ctransfercond, + avg_dtransfer: value.1.avg_dtransfer, + avg_float: value.1.avg_float, + avg_total: value.1.avg_total, + sum_arithshift: value.1.sum_arithshift, + sum_compare: value.1.sum_compare, + sum_ctransfer: value.1.sum_ctransfer, + sum_ctransfercond: value.1.sum_ctransfercond, + sum_dtransfer: value.1.sum_dtransfer, + sum_float: value.1.sum_float, + sum_total: value.1.sum_total, + } + } +} From e16a9603ad07284ac8bbfc1591b1e81e38345717 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Sat, 24 Feb 2024 17:19:33 +0000 Subject: [PATCH 25/40] adding dev cli commands for combos --- src/main.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/main.rs b/src/main.rs index b98ea7e..8a042a4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,6 +22,7 @@ pub mod agfj; pub mod bb; #[cfg(feature = "goblin")] pub mod binnfo; +mod combos; pub mod consts; pub mod dedup; pub mod errors; @@ -225,6 +226,18 @@ enum GenerateSubCommands { #[arg(short, long, value_name = "BPE or Byte-BPE", default_value = "BPE")] tokeniser_type: String, }, + /// Generate combinations of extracted data - Primaryily metadata objects + Combos { + #[arg(short, long, value_name = "INPUT_PATH")] + input_path: PathBuf, + /// The path for the generated output + #[arg(short, long, value_name = "OUTPUT_PATH")] + output_path: PathBuf, + /// Combo Type + #[arg(short, long, value_parser = clap::builder::PossibleValuesParser::new(["finfo+tiknib", "finfoe+tiknib"]) + .map(|s| s.parse::().unwrap()))] + combo_type: String, + }, } #[derive(Subcommand)] @@ -822,6 +835,18 @@ fn main() { } } } + GenerateSubCommands::Combos { + input_path, + output_path, + combo_type, + } => { + if combo_type == "finfo+tiknib" { + let mut 
finfo_paths = + get_json_paths_from_dir(input_path, Some("_finfo".to_string())); + let tiknib_paths = + get_json_paths_from_dir(input_path, Some("cfg-tiknib".to_string())); + } + } GenerateSubCommands::Nlp { path, instruction_type, From c71b014f446e09f6282a7dc3ddeb9f79abc98e2c Mon Sep 17 00:00:00 2001 From: br0kej Date: Sun, 25 Feb 2024 16:03:34 +0000 Subject: [PATCH 26/40] [feature] support for downloading windows symbols if available --- Cargo.toml | 1 + src/extract.rs | 67 ++++++++++++++++++++++++++++++++++++++++++++------ src/main.rs | 6 ++++- 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 47e6e7e..fb6d7c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ env_logger = "0.10.0" thiserror = "1.0.47" enum-as-inner = "0.6.0" ordered-float = { version = "4.2.0", features = ["serde"] } + [dependencies.petgraph] version = "0.6.2" features = ["serde-1"] diff --git a/src/extract.rs b/src/extract.rs index 835a0c5..d3e6d5e 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -1,17 +1,27 @@ use crate::afij::AFIJFunctionInfo; use crate::agcj::AGCJFunctionCallGraphs; +use crate::extract; +use anyhow::anyhow; use anyhow::bail; use anyhow::Error; use anyhow::Result; use r2pipe::R2Pipe; use r2pipe::R2PipeSpawnOptions; +use serde::de; +use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use serde_aux::prelude::*; use serde_json; +use serde_json::error; use serde_json::{json, Value}; use std::collections::HashMap; +use std::env; +use std::fmt::format; +use std::fmt::UpperHex; use std::fs; use std::fs::File; +use std::io::copy; +use std::os::unix::raw::gid_t; use std::path::{Path, PathBuf}; use walkdir::WalkDir; @@ -54,6 +64,7 @@ pub struct ExtractionJob { pub struct R2PipeConfig { pub debug: bool, pub extended_analysis: bool, + pub use_curl_pdb: bool, } impl std::fmt::Display for ExtractionJob { @@ -224,6 +235,7 @@ impl ExtractionJob { mode: &str, debug: &bool, extended_analysis: &bool, + use_curl_pdb: &bool, ) -> Result { fn get_path_type(bin_path: &PathBuf) -> PathType { let fpath_md = fs::metadata(bin_path).unwrap(); @@ -253,6 +265,7 @@ impl ExtractionJob { let r2_handle_config = R2PipeConfig { debug: *debug, extended_analysis: *extended_analysis, + use_curl_pdb: *use_curl_pdb, }; let p_type = get_path_type(input_path); @@ -363,7 +376,9 @@ impl FileToBeProcessed { info!("{} not found. 
Continuing processing.", f_name); let mut r2p = self.setup_r2_pipe(); info!("Executing agfj @@f on {:?}", self.file_path); - let mut json = r2p.cmd("agfj @@f").expect("Failed to extract control flow graph information."); + let mut json = r2p + .cmd("agfj @@f") + .expect("Failed to extract control flow graph information."); info!("Closing r2p process for {:?}", self.file_path); r2p.close(); info!("Starting JSON fixup for {:?}", self.file_path); @@ -516,7 +531,6 @@ impl FileToBeProcessed { } // Helper Functions - fn write_to_json(&self, json_obj: &Value) { let mut fp_filename = self .file_path @@ -544,7 +558,28 @@ impl FileToBeProcessed { .expect("failed to seek addr"); } + fn handle_symbols_pdb(&self, r2p: &mut R2Pipe) -> Result<(), Error> { + // Download symbols if available + debug!("Downloading pdb file for {:?}", self.file_path); + let download_pdb = r2p.cmd("idpd"); + + debug!("Download PDB Ret: {:?}", download_pdb); + + if download_pdb.unwrap().contains("success") { + let ret = r2p.cmd("idp"); + debug!("Return value: {:?}", ret); + + Ok(()) + } else { + Err(anyhow!("Unable to download pdb")) + } + } + fn setup_r2_pipe(&self) -> R2Pipe { + if self.r2p_config.use_curl_pdb { + // Docs suggest this is unsafe + env::set_var("R2_CURL", "1"); + } let opts = if self.r2p_config.debug { debug!("Creating r2 handle with debugging"); @@ -563,13 +598,28 @@ impl FileToBeProcessed { debug!("Attempting to create r2pipe using {:?}", self.file_path); let mut r2p = match R2Pipe::in_session() { Some(_) => R2Pipe::open().expect("Unable to open R2Pipe"), - None => { - R2Pipe::spawn(self.file_path.to_str().unwrap(), Some(opts)).expect("Failed to spawn new R2Pipe") - } + None => R2Pipe::spawn(self.file_path.to_str().unwrap(), Some(opts)) + .expect("Failed to spawn new R2Pipe"), }; + let info = r2p.cmdj("ij"); + if info.is_ok() { + let info = info.unwrap(); + if info["bin"]["bintype"].as_str().unwrap() == "pe" { + debug!("PE file found. 
Handling symbol download!");
+                let ret = self.handle_symbols_pdb(&mut r2p);
+
+                if ret.is_err() {
+                    error!("Unable to get PDB info")
+                }
+            }
+        }
+
         if self.r2p_config.extended_analysis {
-            debug!("Executing 'aaa' r2 command for {}", self.file_path.display());
+            debug!(
+                "Executing 'aaa' r2 command for {}",
+                self.file_path.display()
+            );
             r2p.cmd("aaa")
                 .expect("Unable to complete standard analysis!");
             debug!("'aaa' r2 command complete for {}", self.file_path.display());
         } else {
             debug!("Executing 'aa' r2 command for {}", self.file_path.display());
             r2p.cmd("aa")
                 .expect("Unable to complete standard analysis!");
-            debug!("'aa' r2 command complete for {:?}", self.file_path.display());
+            debug!(
+                "'aa' r2 command complete for {:?}",
+                self.file_path.display()
+            );
         };
         r2p
     }
diff --git a/src/main.rs b/src/main.rs
index 1e0ebaf..adbf6e9 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -278,6 +278,9 @@ enum Commands {

         #[arg(long, default_value = "false")]
         extended_analysis: bool,
+
+        #[arg(long, default_value ="true")]
+        use_curl_pdb: bool,
     },
     /// Generate single embeddings on the fly
     ///
@@ -949,9 +952,10 @@ fn main() {
             num_threads,
             debug,
             extended_analysis,
+            use_curl_pdb,
         } => {
             info!("Creating extraction job");
-            let job = ExtractionJob::new(fpath, output_dir, mode, debug, extended_analysis).unwrap();
+            let job = ExtractionJob::new(fpath, output_dir, mode, debug, extended_analysis, use_curl_pdb).unwrap();

             if job.input_path_type == PathType::Dir {
                 info!("Directory found - will parallel process");

From f908f8a6d271f656a8770b8323f1dbda6802b896 Mon Sep 17 00:00:00 2001
From: br0kej
Date: Sun, 25 Feb 2024 16:04:18 +0000
Subject: [PATCH 27/40] [tidy] clippy suggestions

---
 src/combos.rs | 2 +-
 src/dedup.rs | 12 ++++++------
 src/extract.rs | 20 ++++++++++----------
 src/main.rs | 8 ++++----
 4 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/combos.rs b/src/combos.rs
index f10b5dc..e61be24 100644
--- a/src/combos.rs
+++ b/src/combos.rs
@@ -1,6 +1,6 @@
 use crate::afij::AFIJFeatureSubset;
 use crate::agfj::TikNibFuncFeatures;
-use crate::files::FunctionMetadataTypes;
+
 use ordered_float::OrderedFloat;
 pub struct FinfoTiknib {
     pub name: String,
diff --git a/src/dedup.rs b/src/dedup.rs
index dc1981d..6fa2db6 100644
--- a/src/dedup.rs
+++ b/src/dedup.rs
@@ -124,7 +124,7 @@ impl EsilFuncStringCorpus {
         }

         let mut output_path = output_path.to_owned();
-        if !output_path.to_string_lossy().to_string().ends_with("/") {
+        if !output_path.to_string_lossy().to_string().ends_with('/') {
             output_path.push("/");
         };

@@ -574,7 +574,7 @@ impl CGCorpus {
             .for_each(|(data_ele, filepath)| {
                 let save_path = Self::generate_dedup_filepath(&self.output_path, filepath);
                 let dirs = save_path.parent().unwrap_or(Path::new(""));
-                fs::create_dir_all(&dirs).expect("Failed to create output directory!");
+                fs::create_dir_all(dirs).expect("Failed to create output directory!");

                 serde_json::to_writer(
                     &File::create(save_path).expect("Failed to create writer"),
@@ -586,10 +586,10 @@ impl CGCorpus {
 }

 mod tests {
-    use std::{fs::{self, read_to_string}, path::{Path, PathBuf}};
-    use walkdir::WalkDir;
-    use crate::networkx::{CallGraphFuncWithMetadata, CallGraphNodeFeatureType, CallGraphTypes, NetworkxDiGraph};
-    use super::CGCorpus;
+
+
+
+

     // Test Dedup on typed CG's
     #[test]
diff --git a/src/extract.rs b/src/extract.rs
index d3e6d5e..65ffbad 100644
--- a/src/extract.rs
+++ b/src/extract.rs
@@ -1,27 +1,27 @@
 use crate::afij::AFIJFunctionInfo;
 use crate::agcj::AGCJFunctionCallGraphs;
-use 
crate::extract; + use anyhow::anyhow; use anyhow::bail; use anyhow::Error; use anyhow::Result; use r2pipe::R2Pipe; use r2pipe::R2PipeSpawnOptions; -use serde::de; -use serde::de::DeserializeOwned; + + use serde::{Deserialize, Serialize}; use serde_aux::prelude::*; use serde_json; -use serde_json::error; + use serde_json::{json, Value}; use std::collections::HashMap; use std::env; -use std::fmt::format; -use std::fmt::UpperHex; + + use std::fs; use std::fs::File; -use std::io::copy; -use std::os::unix::raw::gid_t; + + use std::path::{Path, PathBuf}; use walkdir::WalkDir; @@ -223,7 +223,7 @@ impl From<(String, String, String, R2PipeConfig)> for FileToBeProcessed { file_path: PathBuf::from(orig.0), output_path: PathBuf::from(orig.1), job_type_suffix: orig.2, - r2p_config: orig.3.clone(), + r2p_config: orig.3, } } } @@ -491,7 +491,7 @@ impl FileToBeProcessed { let json = r2p.cmd("aflj"); if json.is_ok() { - let json_obj: Vec = serde_json::from_str(&json.as_ref().unwrap()) + let json_obj: Vec = serde_json::from_str(json.as_ref().unwrap()) .expect(&format!( "Unable to convert to JSON object! - {}", json.unwrap() diff --git a/src/main.rs b/src/main.rs index adbf6e9..bccc8a0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -821,7 +821,7 @@ fn main() { file.load_and_deserialize().expect("Unable to load data"); file.tiknib_func_level_feature_gen() } else { - let mut file_paths_vec = + let file_paths_vec = get_json_paths_from_dir(input_path, Some("_cfg".to_string())); file_paths_vec.par_iter().for_each(|filepath| { @@ -843,13 +843,13 @@ fn main() { } GenerateSubCommands::Combos { input_path, - output_path, + output_path: _, combo_type, } => { if combo_type == "finfo+tiknib" { - let mut finfo_paths = + let _finfo_paths = get_json_paths_from_dir(input_path, Some("_finfo".to_string())); - let tiknib_paths = + let _tiknib_paths = get_json_paths_from_dir(input_path, Some("cfg-tiknib".to_string())); } } From 8f9844e53871d2deb72b3bb775504678a93bf1bb Mon Sep 17 00:00:00 2001 From: br0kej Date: Mon, 26 Feb 2024 21:07:00 +0000 Subject: [PATCH 28/40] adding changes to support extraction on windows --- src/agcj.rs | 19 +++++++++++++------ src/extract.rs | 4 ++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/agcj.rs b/src/agcj.rs index e6fbb2a..e38ba80 100644 --- a/src/agcj.rs +++ b/src/agcj.rs @@ -33,7 +33,7 @@ impl AGCJFunctionCallGraphs { networkx_graph: NetworkxDiGraph, type_suffix: &str, ) { - let full_output_path = + let mut full_output_path = get_save_file_path(binary_name, output_path, Some(type_suffix.to_string())); check_or_create_dir(&full_output_path); @@ -43,14 +43,20 @@ impl AGCJFunctionCallGraphs { if function_name.chars().count() > 100 { function_name = self.name[..75].to_string(); } - + let filename = format!( - "{:?}/{}-{}.json", - full_output_path, function_name, type_suffix + "{}-{}.json", + function_name, type_suffix ); + + // Normalise string for windows + let filename = filename.replace(&['(', ')', ',', '\"', ';', ':', '\''][..], ""); + full_output_path.push(filename); + + debug!("Filename to save graphs to: {:?}", full_output_path); serde_json::to_writer( - &File::create(filename).expect("Failed to create writer"), + &File::create(full_output_path).expect("Failed to create writer"), &networkx_graph, ) .expect("Unable to write JSON"); @@ -105,7 +111,8 @@ impl AGCJFunctionCallGraphs { } let filename = format!("{}-{}.json", function_name, type_suffix); - + // Normalise string for windows + let filename = filename.replace(&['(', ')', ',', '\"', ';', ':', '\''][..], ""); 
full_output_path.push(filename); debug!("Attempting to save to {:?}", full_output_path); diff --git a/src/extract.rs b/src/extract.rs index 65ffbad..00bcb8b 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -584,13 +584,13 @@ impl FileToBeProcessed { let opts = if self.r2p_config.debug { debug!("Creating r2 handle with debugging"); R2PipeSpawnOptions { - exepath: "r2".to_owned(), + exepath: "radare2".to_owned(), args: vec!["-e bin.cache=true", "-e log.level=0"], } } else { debug!("Creating r2 handle without debugging"); R2PipeSpawnOptions { - exepath: "r2".to_owned(), + exepath: "radare2".to_owned(), args: vec!["-e bin.cache=true", "-e log.level=1", "-2"], } }; From fb97d45e658c08d2dde5665341bac387c64f4085 Mon Sep 17 00:00:00 2001 From: br0kej Date: Sat, 2 Mar 2024 12:44:33 +0000 Subject: [PATCH 29/40] fixing errors! --- src/agcj.rs | 6 +++--- src/dedup.rs | 12 +++++++----- src/extract.rs | 4 ++-- src/files.rs | 10 +++++++--- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/agcj.rs b/src/agcj.rs index e38ba80..35f5068 100644 --- a/src/agcj.rs +++ b/src/agcj.rs @@ -11,7 +11,7 @@ use std::path::PathBuf; #[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] -pub struct AGCJFunctionCallGraphs { +pub struct AGCJFunctionCallGraph { pub name: String, pub size: i64, pub imports: Option>, @@ -25,7 +25,7 @@ pub struct AGCJParsedObjects { pub nodes: Vec, } -impl AGCJFunctionCallGraphs { +impl AGCJFunctionCallGraph { fn graph_to_json_func_node( &self, binary_name: &PathBuf, @@ -156,7 +156,7 @@ impl AGCJFunctionCallGraphs { trace!("Imports: {:?}", self.imports); for import in self.imports.as_ref().unwrap().iter() { trace! {"Starting to Process {:?}", import}; - let import_object: &Vec<&AGCJFunctionCallGraphs> = &global_cg + let import_object: &Vec<&AGCJFunctionCallGraph> = &global_cg .function_call_graphs .as_ref() .unwrap() diff --git a/src/dedup.rs b/src/dedup.rs index 6fa2db6..24b0d32 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -586,11 +586,13 @@ impl CGCorpus { } mod tests { - - - - - + use std::fs; + use std::fs::read_to_string; + use std::path::{Path, PathBuf}; + use walkdir::WalkDir; + use crate::dedup::CGCorpus; + use crate::networkx::{CallGraphFuncWithMetadata, CallGraphNodeFeatureType, CallGraphTypes, NetworkxDiGraph}; + // Test Dedup on typed CG's #[test] fn test_cg_corpus_gen() { diff --git a/src/extract.rs b/src/extract.rs index 00bcb8b..3d443cd 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -1,5 +1,5 @@ use crate::afij::AFIJFunctionInfo; -use crate::agcj::AGCJFunctionCallGraphs; +use crate::agcj::AGCJFunctionCallGraph; use anyhow::anyhow; use anyhow::bail; @@ -416,7 +416,7 @@ impl FileToBeProcessed { info!("Starting function call graph extraction"); let mut r2p = self.setup_r2_pipe(); let json = r2p.cmd("agCj").expect("agCj command failed to execute"); - let function_call_graphs: Vec = + let function_call_graphs: Vec = serde_json::from_str(&json).expect("Unable to convert to JSON object!"); info!("Function call graph extracted."); r2p.close(); diff --git a/src/files.rs b/src/files.rs index 2fab1a1..0b5c4aa 100644 --- a/src/files.rs +++ b/src/files.rs @@ -1,5 +1,5 @@ use crate::afij::{AFIJFeatureSubset, AFIJFeatureSubsetExtended, AFIJFunctionInfo}; -use crate::agcj::AGCJFunctionCallGraphs; +use crate::agcj::AGCJFunctionCallGraph; use crate::agfj::{AGFJFunc, TikNibFunc}; use crate::bb::{FeatureType, InstructionMode}; use crate::consts::*; @@ -378,7 +378,7 @@ pub enum FunctionMetadataTypes { #[derive(Serialize, 
Deserialize, Debug)] pub struct AGCJFile { pub filename: PathBuf, - pub function_call_graphs: Option>, + pub function_call_graphs: Option>, pub output_path: PathBuf, pub function_metadata: Option, pub include_unk: bool, @@ -390,11 +390,15 @@ impl AGCJFile { #[allow(clippy::expect_fun_call)] // Kept in to ensure that the JSON decode error message is printed alongside the filename - let json: Vec = serde_json::from_str(&data)?; + let json: Vec = serde_json::from_str(&data)?; self.function_call_graphs = Some(json); Ok(()) } + + pub fn generate_global_call_graphs(&mut self){ + todo!() + } } #[derive(Serialize, Deserialize, Debug)] From c1b8162257c6a572668464ce50a3b8fb01e789ee Mon Sep 17 00:00:00 2001 From: br0kej Date: Sat, 2 Mar 2024 20:39:21 +0000 Subject: [PATCH 30/40] fixing unreliable test --- src/dedup.rs | 2 +- test-files/cg_dedup/deduped/.gitkeep | 0 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 test-files/cg_dedup/deduped/.gitkeep diff --git a/src/dedup.rs b/src/dedup.rs index 24b0d32..c69d124 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -696,7 +696,7 @@ mod tests { // clean up if corpus.output_path.is_dir() { - fs::remove_dir_all(&corpus.output_path).expect("Unable to remove directory!"); + let _ = fs::remove_dir_all(&corpus.output_path); } } diff --git a/test-files/cg_dedup/deduped/.gitkeep b/test-files/cg_dedup/deduped/.gitkeep new file mode 100644 index 0000000..e69de29 From f7c6c0b643d8a239a17baafea7dd426c1037f95e Mon Sep 17 00:00:00 2001 From: br0kej Date: Sat, 2 Mar 2024 21:12:56 +0000 Subject: [PATCH 31/40] initial implementation of global call graph generation --- src/files.rs | 99 +++++++++++++++++++++++++++- src/main.rs | 23 ++++++- test-files/cg_dedup/deduped/.gitkeep | 0 3 files changed, 119 insertions(+), 3 deletions(-) delete mode 100644 test-files/cg_dedup/deduped/.gitkeep diff --git a/src/files.rs b/src/files.rs index 0b5c4aa..91a94b4 100644 --- a/src/files.rs +++ b/src/files.rs @@ -9,6 +9,7 @@ use crate::inference::InferenceJob; use crate::utils::get_save_file_path; use enum_as_inner::EnumAsInner; use indicatif::ParallelProgressIterator; +use petgraph::{Graph, Incoming, Outgoing}; use rayon::iter::ParallelIterator; use rayon::prelude::{IntoParallelRefIterator, IntoParallelRefMutIterator}; use serde::{Deserialize, Serialize}; @@ -21,6 +22,10 @@ use std::string::String; use std::sync::mpsc::channel; #[cfg(feature = "inference")] use std::sync::Arc; +use itertools::Itertools; +use petgraph::visit::IntoEdgesDirected; +#[cfg(feature = "inference")] +use tch::nn::func; #[derive(Serialize, Deserialize, Debug)] pub struct AGFJFile { @@ -396,8 +401,55 @@ impl AGCJFile { Ok(()) } - pub fn generate_global_call_graphs(&mut self){ - todo!() + pub fn build_global_call_graphs(&mut self) -> Graph { + if self.function_call_graphs.is_none() { + let ret = self.load_and_deserialize(); + if ret.is_err() { + error!("Unable to load target data file - No functions to process.") + } + } + + let mut graph = Graph::::new(); + + for function in self.function_call_graphs.as_ref().unwrap().iter() { + let function_index_find = graph.node_indices().find(|i| &graph[*i] == &function.name); + let function_index = if function_index_find.is_none() { + graph.add_node(function.name.clone()) + } else { + function_index_find.unwrap() + }; + + debug!("Function Index Find: {:?} Function Index Used: {:?}", function_index_find, function_index); + + if function.imports.is_some() { + for import in function.imports.as_ref().unwrap().iter() { + if !self.include_unk && 
import.starts_with("unk.") {
+                    debug!("Skipping {}", import);
+                    continue
+                } else {
+                    let import_index_find = graph.node_indices().find(|i| &graph[*i] == import);
+                    if import_index_find.is_none() {
+                        let import_index = graph.add_node(import.clone());
+                        graph.update_edge(function_index, import_index, 0);
+                    } else {
+                        graph.update_edge(function_index, import_index_find.unwrap(), 0);
+                    }
+                }
+            }
+        }
+
+        // Tidy up the generated call graph to account for when
+        // calling relationships may have not been recovered and
+        // we have orphan nodes
+        for node_idx in graph.node_indices() {
+            if graph.neighbors_directed(node_idx, Outgoing).collect_vec().len() +
+                graph.neighbors_directed(node_idx, Incoming).collect_vec().len() == 0 {
+                graph.remove_node(node_idx);
+            }
+        }
+
+        graph
+    }
 }

 #[derive(Serialize, Deserialize, Debug)]
@@ -474,3 +526,46 @@ impl TikNibFuncMetaFile {
         FunctionMetadataTypes::AGFJ(self.function_info.clone().unwrap())
     }
 }
+
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashSet;
+    use std::path::PathBuf;
+    use crate::files::AGCJFile;
+
+    fn return_test_file_oject() -> AGCJFile {
+        let mut call_graph_file = AGCJFile {
+            filename: PathBuf::from("test-files/ls_cg.json"),
+            function_call_graphs: None,
+            output_path: PathBuf::new(),
+            function_metadata: None,
+            include_unk: false,
+        };
+
+        call_graph_file
+            .load_and_deserialize()
+            .expect("Failed to load data");
+        call_graph_file
+    }
+
+    #[test]
+    fn test_global_call_graph_generation() {
+        let mut call_graph_file = return_test_file_oject();
+
+        let global_call_graph = call_graph_file.build_global_call_graphs();
+
+        assert_eq!(global_call_graph.node_count(), 111);
+
+        let mut node_names = Vec::new();
+
+        for node in global_call_graph.raw_nodes().iter() {
+            node_names.push(node.weight.clone())
+        }
+
+        let unique_node_names = node_names.iter()
+            .collect::>();
+
+        assert_eq!(node_names.len(), unique_node_names.len());
+    }
+}
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index bccc8a0..9b67323 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -65,6 +65,7 @@ enum DataType {
     OneHopCg,
     CgWithCallers,
     OneHopCgWithcallers,
+    GlobalCg,
     Invalid,
 }

@@ -76,6 +77,7 @@ impl fmt::Display for DataType {
             DataType::CgWithCallers => write!(f, "Call Graph with Callers"),
             DataType::OneHopCg => write!(f, "One Hop Call Graph"),
             DataType::OneHopCgWithcallers => write!(f, "One Hop Call Graph with Callers"),
+            DataType::GlobalCg => write!(f, "Global Call Graph"),
             DataType::Invalid => write!(f, "Invalid"),
         }
     }
@@ -97,7 +99,7 @@ enum GenerateSubCommands {
         path: PathBuf,

         /// The target data type
-        #[arg(short, long, value_name = "DATA_TYPE", value_parser = clap::builder::PossibleValuesParser::new(["cfg", "cg", "onehopcg", "cgcallers", "onehopcgcallers"])
+        #[arg(short, long, value_name = "DATA_TYPE", value_parser = clap::builder::PossibleValuesParser::new(["cfg", "cg", "onehopcg", "cgcallers", "onehopcgcallers", "globalcg"])
         .map(|s| s.parse::().unwrap()),)]
         data_type: String,

@@ -411,6 +413,7 @@ fn main() {
                 "onehopcg" => DataType::OneHopCg,
                 "cgcallers" => DataType::CgWithCallers,
                 "onehopcgcallers" => DataType::OneHopCgWithcallers,
+                "globalcg" => DataType::GlobalCg,
                 _ => DataType::Invalid,
             };

@@ -500,6 +503,24 @@ fn main() {
                     } else {
                         error!("--feature-type/-f is required for creating CFG's")
                     }
+                } else if graph_data_type == DataType::GlobalCg {
+                    if Path::new(path).is_file() {
+                        let mut file = AGCJFile {
+                            filename: (*path).clone(),
+                            function_call_graphs: None,
+                            output_path: (*output_path).clone(),
+                            function_metadata: None,
+                            include_unk: *include_unk,
+                        };
file.load_and_deserialize() + .expect("Unable to load and desearilize JSON"); + let global_cg = file.build_global_call_graphs(); + println!("{:?}", global_cg); + } else { + todo!("Need to do this!"); + //let mut file_paths_vec = + // get_json_paths_from_dir(path, Some("_cg".to_string())); + } } else { // If its only one file if Path::new(path).is_file() { diff --git a/test-files/cg_dedup/deduped/.gitkeep b/test-files/cg_dedup/deduped/.gitkeep deleted file mode 100644 index e69de29..0000000 From 4def23a202de3f9edb81a280a4684a10028691ee Mon Sep 17 00:00:00 2001 From: br0kej Date: Sun, 3 Mar 2024 09:01:00 +0000 Subject: [PATCH 32/40] [feature/refactor] Adding support for save the generated global call graphs to networkx compat JSON + tests --- src/agcj.rs | 33 +++++++----- src/agfj.rs | 4 +- src/dedup.rs | 6 ++- src/extract.rs | 3 -- src/files.rs | 135 +++++++++++++++++++++++++++++++++++++++---------- src/main.rs | 31 ++++++++---- src/utils.rs | 29 +++++++++-- 7 files changed, 183 insertions(+), 58 deletions(-) diff --git a/src/agcj.rs b/src/agcj.rs index 35f5068..d1d983e 100644 --- a/src/agcj.rs +++ b/src/agcj.rs @@ -33,8 +33,12 @@ impl AGCJFunctionCallGraph { networkx_graph: NetworkxDiGraph, type_suffix: &str, ) { - let mut full_output_path = - get_save_file_path(binary_name, output_path, Some(type_suffix.to_string())); + let mut full_output_path = get_save_file_path( + binary_name, + output_path, + Some(type_suffix.to_string()), + None, + ); check_or_create_dir(&full_output_path); let mut function_name = self.name.clone(); @@ -43,12 +47,9 @@ impl AGCJFunctionCallGraph { if function_name.chars().count() > 100 { function_name = self.name[..75].to_string(); } - - let filename = format!( - "{}-{}.json", - function_name, type_suffix - ); - + + let filename = format!("{}-{}.json", function_name, type_suffix); + // Normalise string for windows let filename = filename.replace(&['(', ')', ',', '\"', ';', ':', '\''][..], ""); full_output_path.push(filename); @@ -69,8 +70,12 @@ impl AGCJFunctionCallGraph { networkx_graph: NetworkxDiGraph, type_suffix: &str, ) { - let full_output_path = - get_save_file_path(binary_name, output_path, Some(type_suffix.to_string())); + let full_output_path = get_save_file_path( + binary_name, + output_path, + Some(type_suffix.to_string()), + None, + ); check_or_create_dir(&full_output_path); let mut function_name = self.name.clone(); @@ -99,8 +104,12 @@ impl AGCJFunctionCallGraph { networkx_graph: NetworkxDiGraph, type_suffix: &str, ) { - let mut full_output_path = - get_save_file_path(binary_name, output_path, Some(type_suffix.to_string())); + let mut full_output_path = get_save_file_path( + binary_name, + output_path, + Some(type_suffix.to_string()), + None, + ); check_or_create_dir(&full_output_path); debug!("Built Path: {:?}", full_output_path); let mut function_name = self.name.clone(); diff --git a/src/agfj.rs b/src/agfj.rs index 7f10e2e..dbb5783 100644 --- a/src/agfj.rs +++ b/src/agfj.rs @@ -248,7 +248,7 @@ impl AGFJFunc { inference_job: &Option>, ) { info!("Processing {:?}", self.name); - let full_output_path = get_save_file_path(path, output_path, None); + let full_output_path = get_save_file_path(path, output_path, None, None); check_or_create_dir(&full_output_path); // offset != 1 has been added to skip functions with invalid instructions @@ -331,7 +331,7 @@ impl AGFJFunc { feature_type: FeatureType, architecture: &String, ) { - let full_output_path = get_save_file_path(path, output_path, None); + let full_output_path = get_save_file_path(path, output_path, 
None, None); check_or_create_dir(&full_output_path); let file_name = path.file_name().unwrap(); let binding = file_name.to_string_lossy().to_string(); diff --git a/src/dedup.rs b/src/dedup.rs index c69d124..11dd6e1 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -586,12 +586,14 @@ impl CGCorpus { } mod tests { + use crate::dedup::CGCorpus; + use crate::networkx::{ + CallGraphFuncWithMetadata, CallGraphNodeFeatureType, CallGraphTypes, NetworkxDiGraph, + }; use std::fs; use std::fs::read_to_string; use std::path::{Path, PathBuf}; use walkdir::WalkDir; - use crate::dedup::CGCorpus; - use crate::networkx::{CallGraphFuncWithMetadata, CallGraphNodeFeatureType, CallGraphTypes, NetworkxDiGraph}; // Test Dedup on typed CG's #[test] diff --git a/src/extract.rs b/src/extract.rs index 3d443cd..975d5ce 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -8,7 +8,6 @@ use anyhow::Result; use r2pipe::R2Pipe; use r2pipe::R2PipeSpawnOptions; - use serde::{Deserialize, Serialize}; use serde_aux::prelude::*; use serde_json; @@ -17,11 +16,9 @@ use serde_json::{json, Value}; use std::collections::HashMap; use std::env; - use std::fs; use std::fs::File; - use std::path::{Path, PathBuf}; use walkdir::WalkDir; diff --git a/src/files.rs b/src/files.rs index 91a94b4..e4443e9 100644 --- a/src/files.rs +++ b/src/files.rs @@ -6,9 +6,12 @@ use crate::consts::*; use crate::errors::FileLoadError; #[cfg(feature = "inference")] use crate::inference::InferenceJob; -use crate::utils::get_save_file_path; +use crate::networkx::{CallGraphFuncNameNode, NetworkxDiGraph}; +use crate::utils::{check_or_create_dir, get_save_file_path}; use enum_as_inner::EnumAsInner; use indicatif::ParallelProgressIterator; +use itertools::Itertools; +use petgraph::visit::IntoEdgesDirected; use petgraph::{Graph, Incoming, Outgoing}; use rayon::iter::ParallelIterator; use rayon::prelude::{IntoParallelRefIterator, IntoParallelRefMutIterator}; @@ -22,8 +25,6 @@ use std::string::String; use std::sync::mpsc::channel; #[cfg(feature = "inference")] use std::sync::Arc; -use itertools::Itertools; -use petgraph::visit::IntoEdgesDirected; #[cfg(feature = "inference")] use tch::nn::func; @@ -144,7 +145,8 @@ impl AGFJFile { /// It is *not* suitable for doing any other sort of tasks such as Next Sentence /// Prediction (NSP) as there is not indication of where a basic block starts or ends. pub fn generate_random_bb_walk(mut self, esil: bool, pairs: bool) { - let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); let fname_string = if esil { format!("{:?}-esil-singles-rwdfs.txt", fname_string) } else { @@ -194,7 +196,8 @@ impl AGFJFile { /// Generates a single string which contains the ESIL representation of every /// instruction within a function pub fn generate_esil_func_strings(mut self) { - let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); let fname_string = format!("{:?}-efs.json", fname_string); if !Path::new(&fname_string).exists() { @@ -234,7 +237,8 @@ impl AGFJFile { pub fn generate_disasm_func_strings(mut self) { // This needs to be amended so that there is a AGFJFunc function // that returns a function as a func string. 
- let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); let fname_string = format!("{:?}-dfs.json", fname_string); if !Path::new(&fname_string).exists() { @@ -260,7 +264,7 @@ impl AGFJFile { let json = json!(map); let fname_string: PathBuf = - get_save_file_path(&self.filename, &self.output_path, None); + get_save_file_path(&self.filename, &self.output_path, None, None); let fname_string = format!("{:?}-dfs.json", fname_string); serde_json::to_writer( @@ -278,7 +282,8 @@ impl AGFJFile { /// This ignores control flow and simple iterates the JSON objects from the top to /// the bottom. pub fn generate_linear_bb_walk(mut self, esil: bool) { - let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); let fname_string = if esil { format!("{:?}-esil-singles.txt", fname_string) } else { @@ -340,7 +345,8 @@ impl AGFJFile { } let json = json!(&func_feature_vectors); - let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); let fname_string = format!("{}-tiknib.json", fname_string.to_string_lossy()); serde_json::to_writer( &File::create(fname_string).expect("Failed to create writer"), @@ -401,7 +407,15 @@ impl AGCJFile { Ok(()) } - pub fn build_global_call_graphs(&mut self) -> Graph { + pub fn generate_global_call_graphs(&mut self) { + let call_graph = self.build_global_call_graph(); + println!("Num Nodes (Default): {}", call_graph.node_count()); + let cleaned_graph = self.post_process_graph(call_graph); + println!("Num Nodes (Post-Clean): {}", cleaned_graph.node_count()); + self.save_global_call_graph_to_json(cleaned_graph) + } + + fn build_global_call_graph(&mut self) -> Graph { if self.function_call_graphs.is_none() { let ret = self.load_and_deserialize(); if ret.is_err() { @@ -419,13 +433,16 @@ impl AGCJFile { function_index_find.unwrap() }; - debug!("Function Index Find: {:?} Function Index Used: {:?}", function_index_find, function_index); + debug!( + "Function Index Find: {:?} Function Index Used: {:?}", + function_index_find, function_index + ); if function.imports.is_some() { for import in function.imports.as_ref().unwrap().iter() { if !self.include_unk && import.starts_with("unk.") { debug!("Skipping {}", import); - continue + continue; } else { let import_index_find = graph.node_indices().find(|i| &graph[*i] == import); if import_index_find.is_none() { @@ -438,19 +455,53 @@ impl AGCJFile { } } } + graph + } + fn post_process_graph(&self, mut graph: Graph) -> Graph { // Tidy up the generated call graph to account for when - // calling relationships may have not be recovered and + // calling relationships may have not been recovered and // we have orphan nodes for node_idx in graph.node_indices() { - if graph.neighbors_directed(node_idx, Outgoing).collect_vec().len() + - graph.neighbors_directed(node_idx, Incoming).collect_vec().len() == 0 { + if graph + .neighbors_directed(node_idx, Outgoing) + .collect_vec() + .len() + + graph + .neighbors_directed(node_idx, Incoming) + .collect_vec() + .len() + == 0 + { graph.remove_node(node_idx); } } - graph } + fn save_global_call_graph_to_json(&self, graph: Graph) { + let networkx_graph = NetworkxDiGraph::from(graph); + + let mut full_output_path = 
get_save_file_path( + &self.filename, + &self.output_path, + Some("gcg".to_string()), + Some("_cg".to_string()), + ); + check_or_create_dir(&full_output_path); + + full_output_path.set_extension("json".to_string()); + + debug!( + "Attempting to save global call graph to: {:?}", + full_output_path + ); + + serde_json::to_writer( + &File::create(full_output_path).expect("Failed to create writer"), + &networkx_graph, + ) + .expect("Unable to write JSON"); + } } #[derive(Serialize, Deserialize, Debug)] @@ -493,7 +544,8 @@ impl AFIJFile { } pub fn subset_and_save(&mut self, extended: bool) { let func_info_subsets = self.subset(extended); - let fname_string: PathBuf = get_save_file_path(&self.filename, &self.output_path, None); + let fname_string: PathBuf = + get_save_file_path(&self.filename, &self.output_path, None, None); let filename = format!("{}-finfo-subset.json", fname_string.to_string_lossy()); serde_json::to_writer( &File::create(filename).expect("Failed to create writer"), @@ -527,16 +579,15 @@ impl TikNibFuncMetaFile { } } - #[cfg(test)] mod tests { + use crate::files::AGCJFile; use std::collections::HashSet; use std::path::PathBuf; - use crate::files::AGCJFile; - fn return_test_file_oject() -> AGCJFile { + fn return_test_file_oject(file_path: &str) -> AGCJFile { let mut call_graph_file = AGCJFile { - filename: PathBuf::from("test-files/ls_cg.json"), + filename: PathBuf::from(file_path), function_call_graphs: None, output_path: PathBuf::new(), function_metadata: None, @@ -551,9 +602,9 @@ mod tests { #[test] fn test_global_call_graph_generation() { - let mut call_graph_file = return_test_file_oject(); + let mut call_graph_file = return_test_file_oject("test-files/ls_cg.json"); - let global_call_graph = call_graph_file.build_global_call_graphs(); + let global_call_graph = call_graph_file.build_global_call_graph(); assert_eq!(global_call_graph.node_count(), 111); @@ -563,9 +614,41 @@ mod tests { node_names.push(node.weight.clone()) } - let unique_node_names = node_names.iter() - .collect::>(); + let unique_node_names = node_names.iter().collect::>(); + + assert_eq!(node_names.len(), unique_node_names.len()); + } + + #[test] + fn test_global_graph_with_redudent_nodes() { + let mut call_graph_file = return_test_file_oject("data-examples/raw/test_bin_cg.json"); + + let global_call_graph = call_graph_file.build_global_call_graph(); + + assert_eq!(global_call_graph.node_count(), 9); + + let mut node_names = Vec::new(); + + for node in global_call_graph.raw_nodes().iter() { + node_names.push(node.weight.clone()) + } + + let unique_node_names = node_names.iter().collect::>(); + + assert_eq!(node_names.len(), unique_node_names.len()); + + let post_processed_call_graph = call_graph_file.post_process_graph(global_call_graph); + + assert_eq!(post_processed_call_graph.node_count(), 8); + + let mut node_names = Vec::new(); + + for node in post_processed_call_graph.raw_nodes().iter() { + node_names.push(node.weight.clone()) + } + + let unique_node_names = node_names.iter().collect::>(); assert_eq!(node_names.len(), unique_node_names.len()); } -} \ No newline at end of file +} diff --git a/src/main.rs b/src/main.rs index 9b67323..b46f792 100644 --- a/src/main.rs +++ b/src/main.rs @@ -42,7 +42,7 @@ use crate::files::{AFIJFile, AGCJFile, FunctionMetadataTypes, TikNibFuncMetaFile use crate::tokeniser::{train_byte_bpe_tokeniser, TokeniserType}; use crate::utils::get_save_file_path; -use crate::networkx::CallGraphNodeFeatureType; +use crate::networkx::{CallGraphFuncNameNode, CallGraphNodeFeatureType, 
NetworkxDiGraph}; use bb::{FeatureType, InstructionMode}; #[cfg(feature = "goblin")] use binnfo::goblin_info; @@ -281,7 +281,7 @@ enum Commands { #[arg(long, default_value = "false")] extended_analysis: bool, - #[arg(long, default_value ="true")] + #[arg(long, default_value = "true")] use_curl_pdb: bool, }, /// Generate single embeddings on the fly @@ -504,6 +504,9 @@ fn main() { error!("--feature-type/-f is required for creating CFG's") } } else if graph_data_type == DataType::GlobalCg { + warn!("This functionality currently only supports Function name node types."); + // Need to add the functionality for adding metadata + // to the calls graphs as well as saving them if Path::new(path).is_file() { let mut file = AGCJFile { filename: (*path).clone(), @@ -514,12 +517,13 @@ fn main() { }; file.load_and_deserialize() .expect("Unable to load and desearilize JSON"); - let global_cg = file.build_global_call_graphs(); - println!("{:?}", global_cg); + info!( + "Generating and saving global call graph for: {}", + path.display() + ); + file.generate_global_call_graphs(); } else { - todo!("Need to do this!"); - //let mut file_paths_vec = - // get_json_paths_from_dir(path, Some("_cg".to_string())); + todo!("Parallel generation of Global Call Graphs is currently not implemented"); } } else { // If its only one file @@ -624,6 +628,7 @@ fn main() { &PathBuf::from(path), output_path, Some(suffix), + None, ); if !full_output_path.is_dir() { let mut file = AGCJFile { @@ -717,7 +722,7 @@ fn main() { combined_cgs_metadata.par_iter().progress().for_each(|(filepath, metapath)| { let suffix = format!("{}-meta", graph_type.to_owned()); - let full_output_path = get_save_file_path(&PathBuf::from(filepath), output_path, Some(suffix)); + let full_output_path = get_save_file_path(&PathBuf::from(filepath), output_path, Some(suffix), None); if !full_output_path.is_dir() { let mut file = { let metadata: Option; @@ -976,7 +981,15 @@ fn main() { use_curl_pdb, } => { info!("Creating extraction job"); - let job = ExtractionJob::new(fpath, output_dir, mode, debug, extended_analysis, use_curl_pdb).unwrap(); + let job = ExtractionJob::new( + fpath, + output_dir, + mode, + debug, + extended_analysis, + use_curl_pdb, + ) + .unwrap(); if job.input_path_type == PathType::Dir { info!("Directory found - will parallel process"); diff --git a/src/utils.rs b/src/utils.rs index a1cd0df..f50ca70 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -18,17 +18,25 @@ pub fn get_save_file_path( binary_path: &PathBuf, output_path: &PathBuf, optional_suffix: Option, + remove_suffix: Option, ) -> PathBuf { debug!( "Building Filepath - Binary Path: {:?} Output Path: {:?}", binary_path, output_path ); - let file_name = binary_path + let mut file_name = binary_path .file_stem() .unwrap() .to_string_lossy() .to_string(); + let file_name = if remove_suffix.is_some() { + let file_name = file_name.replace(&remove_suffix.unwrap(), ""); + file_name + } else { + file_name + }; + if optional_suffix.is_none() { let full_output_path = format!( "{}/{}", @@ -97,21 +105,34 @@ mod tests { fn test_get_save_file_path_1() { let path: &PathBuf = &PathBuf::from("test_bin/hello.json"); let output_path: &PathBuf = &PathBuf::from("processed_data/"); - let output_path = get_save_file_path(path, &output_path, Some("cg".to_string())); + let output_path = get_save_file_path(path, &output_path, Some("cg".to_string()), None); assert_eq!(output_path, PathBuf::from("processed_data/hello-cg")) } #[test] fn test_get_save_file_path_2() { let path: &PathBuf = 
&PathBuf::from("test_bin/extra_dir/hello.json"); let output_path: &PathBuf = &PathBuf::from("with_more/processed_data/"); - let output = get_save_file_path(path, output_path, None); + let output = get_save_file_path(path, output_path, None, None); assert_eq!(output, PathBuf::from("with_more/processed_data/hello")) } #[test] fn test_get_save_file_path_3() { let path: &PathBuf = &PathBuf::from("hello.json"); let output_path: &PathBuf = &PathBuf::from("processed_data"); - let output = get_save_file_path(path, &output_path, None); + let output = get_save_file_path(path, &output_path, None, None); assert_eq!(output, PathBuf::from("processed_data/hello")) } + + #[test] + fn test_get_save_file_path_with_suffix_removal() { + let path: &PathBuf = &PathBuf::from("hello_cg.json"); + let output_path: &PathBuf = &PathBuf::from("processed_data"); + let output = get_save_file_path( + path, + &output_path, + Some("gcg".to_string()), + Some("_cg".to_string()), + ); + assert_eq!(output, PathBuf::from("processed_data/hello-gcg")) + } } From 383f47fce949d9646011761da5e1c639a73ca3fd Mon Sep 17 00:00:00 2001 From: br0kej Date: Sun, 3 Mar 2024 09:41:08 +0000 Subject: [PATCH 33/40] applying clippy suggestions --- src/dedup.rs | 13 +++++++------ src/extract.rs | 10 +++------- src/files.rs | 26 ++++++++++++++------------ src/main.rs | 2 +- src/utils.rs | 8 ++++---- 5 files changed, 29 insertions(+), 30 deletions(-) diff --git a/src/dedup.rs b/src/dedup.rs index 11dd6e1..4c57ed2 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -234,7 +234,7 @@ impl EsilFuncStringCorpus { } /// Generate hash statistics from a func hash tuple collection - fn hash_stats(&self, original_len: usize, unique_func_has_tuples: &Vec) { + fn hash_stats(&self, original_len: usize, unique_func_has_tuples: &[DedupEntry]) { let unique_len = unique_func_has_tuples.len(); let percent_difference: f32 = ((original_len as f32 - unique_len as f32) / original_len as f32) * 100.0; @@ -363,7 +363,7 @@ impl CGCorpus { } } - fn dedup_corpus_inplace(data: &mut Vec>, filepaths: &mut Vec) { + fn dedup_corpus_inplace(data: &mut [Option], filepaths: &mut [PathBuf]) { let mut seen = HashSet::new(); for (i, data_ele) in data.iter().enumerate() { let hash_value = Self::calculate_hash(&data_ele); @@ -435,7 +435,7 @@ impl CGCorpus { unique_binaries_fps } - fn load_subset(&self, fp_subset: &Vec) -> Vec> { + fn load_subset(&self, fp_subset: &[PathBuf]) -> Vec> { let mut subset_loaded_data = Vec::new(); for ele in fp_subset.iter() { let data = read_to_string(ele).expect(&format!("Unable to read file - {:?}", ele)); @@ -553,10 +553,10 @@ impl CGCorpus { } } - fn generate_dedup_filepath(output_path: &PathBuf, filepath: &PathBuf) -> PathBuf { + fn generate_dedup_filepath(output_path: &Path, filepath: &Path) -> PathBuf { let first_two = filepath.components().rev().take(2).collect::>(); let first_two: PathBuf = first_two.iter().rev().collect(); - let output = output_path.clone(); + let output = output_path.to_path_buf(); let mut final_path = PathBuf::new(); final_path.push(output); final_path.push(first_two); @@ -566,7 +566,7 @@ impl CGCorpus { pub fn save_corpus( &self, subset_loaded_data: Vec, - fp_subset: &mut Vec, + fp_subset: &mut [PathBuf], ) { subset_loaded_data .iter() @@ -585,6 +585,7 @@ impl CGCorpus { } } +#[cfg(test)] mod tests { use crate::dedup::CGCorpus; use crate::networkx::{ diff --git a/src/extract.rs b/src/extract.rs index 975d5ce..2fafed0 100644 --- a/src/extract.rs +++ b/src/extract.rs @@ -487,13 +487,9 @@ impl FileToBeProcessed { info!("Getting function 
information from binary"); let json = r2p.cmd("aflj"); - if json.is_ok() { - let json_obj: Vec = serde_json::from_str(json.as_ref().unwrap()) - .expect(&format!( - "Unable to convert to JSON object! - {}", - json.unwrap() - )); - + if let Ok(json_str) = json { + let json_obj: Vec = serde_json::from_str(json_str.as_ref()) + .expect("Unable to convert to JSON object!"); Ok(json_obj) } else { Err(json.unwrap_err()) diff --git a/src/files.rs b/src/files.rs index e4443e9..d0d1d90 100644 --- a/src/files.rs +++ b/src/files.rs @@ -6,12 +6,12 @@ use crate::consts::*; use crate::errors::FileLoadError; #[cfg(feature = "inference")] use crate::inference::InferenceJob; -use crate::networkx::{CallGraphFuncNameNode, NetworkxDiGraph}; +use crate::networkx::{NetworkxDiGraph}; use crate::utils::{check_or_create_dir, get_save_file_path}; use enum_as_inner::EnumAsInner; use indicatif::ParallelProgressIterator; use itertools::Itertools; -use petgraph::visit::IntoEdgesDirected; + use petgraph::{Graph, Incoming, Outgoing}; use rayon::iter::ParallelIterator; use rayon::prelude::{IntoParallelRefIterator, IntoParallelRefMutIterator}; @@ -426,11 +426,12 @@ impl AGCJFile { let mut graph = Graph::::new(); for function in self.function_call_graphs.as_ref().unwrap().iter() { - let function_index_find = graph.node_indices().find(|i| &graph[*i] == &function.name); - let function_index = if function_index_find.is_none() { - graph.add_node(function.name.clone()) + let function_index_find = graph.node_indices().find(|i| graph[*i] == function.name); + + let function_index = if let Some(index) = function_index_find { + index } else { - function_index_find.unwrap() + graph.add_node(function.name.clone()) }; debug!( @@ -445,12 +446,13 @@ impl AGCJFile { continue; } else { let import_index_find = graph.node_indices().find(|i| &graph[*i] == import); - if import_index_find.is_none() { - let import_index = graph.add_node(import.clone()); - graph.update_edge(function_index, import_index, 0); + let import_index = if let Some(index) = import_index_find { + index } else { - graph.update_edge(function_index, import_index_find.unwrap(), 0); - } + graph.add_node(import.clone()) + }; + + graph.update_edge(function_index, import_index, 0); } } } @@ -489,7 +491,7 @@ impl AGCJFile { ); check_or_create_dir(&full_output_path); - full_output_path.set_extension("json".to_string()); + full_output_path.set_extension("json"); debug!( "Attempting to save global call graph to: {:?}", diff --git a/src/main.rs b/src/main.rs index b46f792..d2dba73 100644 --- a/src/main.rs +++ b/src/main.rs @@ -42,7 +42,7 @@ use crate::files::{AFIJFile, AGCJFile, FunctionMetadataTypes, TikNibFuncMetaFile use crate::tokeniser::{train_byte_bpe_tokeniser, TokeniserType}; use crate::utils::get_save_file_path; -use crate::networkx::{CallGraphFuncNameNode, CallGraphNodeFeatureType, NetworkxDiGraph}; +use crate::networkx::{CallGraphNodeFeatureType}; use bb::{FeatureType, InstructionMode}; #[cfg(feature = "goblin")] use binnfo::goblin_info; diff --git a/src/utils.rs b/src/utils.rs index f50ca70..495dcbc 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -24,19 +24,19 @@ pub fn get_save_file_path( "Building Filepath - Binary Path: {:?} Output Path: {:?}", binary_path, output_path ); - let mut file_name = binary_path + let file_name = binary_path .file_stem() .unwrap() .to_string_lossy() .to_string(); - let file_name = if remove_suffix.is_some() { - let file_name = file_name.replace(&remove_suffix.unwrap(), ""); - file_name + let file_name = if let Some(suffix) = remove_suffix { + 
file_name.replace(&suffix, "")
     } else {
         file_name
     };

+
     if optional_suffix.is_none() {
         let full_output_path = format!(
             "{}/{}",

From cab778fd2ca560988d7fcd6186ce6586f5cec855 Mon Sep 17 00:00:00 2001
From: Br0kej
Date: Sun, 3 Mar 2024 20:45:20 +0000
Subject: [PATCH 34/40] [refactor] simplifying call graph related cli functionality

---
 src/files.rs | 58 +++++++++-
 src/main.rs | 306 +++++++++++++++++----------------------------
 2 files changed, 160 insertions(+), 204 deletions(-)

diff --git a/src/files.rs b/src/files.rs
index d0d1d90..63eabf5 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -6,12 +6,13 @@ use crate::consts::*;
 use crate::errors::FileLoadError;
 #[cfg(feature = "inference")]
 use crate::inference::InferenceJob;
-use crate::networkx::{NetworkxDiGraph};
+use crate::networkx::NetworkxDiGraph;
 use crate::utils::{check_or_create_dir, get_save_file_path};
 use enum_as_inner::EnumAsInner;
 use indicatif::ParallelProgressIterator;
 use itertools::Itertools;

+use crate::DataType;
 use petgraph::{Graph, Incoming, Outgoing};
 use rayon::iter::ParallelIterator;
 use rayon::prelude::{IntoParallelRefIterator, IntoParallelRefMutIterator};
@@ -407,6 +408,7 @@ impl AGCJFile {
         Ok(())
     }

+    // Global Call Graph Related Functions
     pub fn generate_global_call_graphs(&mut self) {
         let call_graph = self.build_global_call_graph();
         println!("Num Nodes (Default): {}", call_graph.node_count());
@@ -504,6 +506,60 @@ impl AGCJFile {
         )
         .expect("Unable to write JSON");
     }
+
+    // Local Call Graph Helper Functions
+    pub fn process_based_on_graph_data_type(
+        &self,
+        graph_data_type: DataType,
+        with_features: &bool,
+        metadata_type: Option,
+    ) {
+        for fcg in self.function_call_graphs.as_ref().unwrap() {
+            match graph_data_type {
+                DataType::Cg => {
+                    fcg.to_petgraph(
+                        &self,
+                        &self.output_path,
+                        &self.filename,
+                        with_features,
+                        &self.include_unk,
+                        metadata_type.clone(),
+                    );
+                }
+                DataType::OneHopCg => {
+                    fcg.one_hop_to_petgraph(
+                        &self,
+                        &self.output_path,
+                        &self.filename,
+                        with_features,
+                        &self.include_unk,
+                        metadata_type.clone(),
+                    );
+                }
+                DataType::CgWithCallers => {
+                    fcg.to_petgraph_with_callers(
+                        &self,
+                        &self.output_path,
+                        &self.filename,
+                        with_features,
+                        &self.include_unk,
+                        metadata_type.clone(),
+                    );
+                }
+                DataType::OneHopCgWithcallers => {
+                    fcg.one_hop_to_petgraph_with_callers(
+                        &self,
+                        &self.output_path,
+                        &self.filename,
+                        with_features,
+                        &self.include_unk,
+                        metadata_type.clone(),
+                    );
+                }
+                _ => unreachable!("Not possible hopefully! 
:O"), + } + } + } } #[derive(Serialize, Deserialize, Debug)] diff --git a/src/main.rs b/src/main.rs index d2dba73..afb8143 100644 --- a/src/main.rs +++ b/src/main.rs @@ -42,7 +42,7 @@ use crate::files::{AFIJFile, AGCJFile, FunctionMetadataTypes, TikNibFuncMetaFile use crate::tokeniser::{train_byte_bpe_tokeniser, TokeniserType}; use crate::utils::get_save_file_path; -use crate::networkx::{CallGraphNodeFeatureType}; +use crate::networkx::CallGraphNodeFeatureType; use bb::{FeatureType, InstructionMode}; #[cfg(feature = "goblin")] use binnfo::goblin_info; @@ -58,8 +58,8 @@ use utils::get_json_paths_from_dir; #[global_allocator] static GLOBAL: MiMalloc = MiMalloc; -#[derive(PartialEq)] -enum DataType { +#[derive(PartialEq, Copy, Clone)] +pub enum DataType { Cfg, Cg, OneHopCg, @@ -526,85 +526,43 @@ fn main() { todo!("Parallel generation of Global Call Graphs is currently not implemented"); } } else { - // If its only one file if Path::new(path).is_file() { - let mut file = if *with_features { - if metadata_path.is_none() { - error!("with features active - require --metadata-path argument"); - exit(1) - }; - let mut metadata = AFIJFile { - filename: metadata_path.clone().unwrap().to_path_buf(), - function_info: None, - output_path: PathBuf::new(), - }; - metadata - .load_and_deserialize() - .expect("Unable to load file"); - let metadata_subset = metadata.subset(false); - AGCJFile { - filename: (*path).clone(), - function_call_graphs: None, - output_path: (*output_path).clone(), - function_metadata: Some(metadata_subset), - include_unk: *include_unk, + let mut file = match with_features { + true => { + let mut metadata = AFIJFile { + filename: metadata_path.as_ref().unwrap().to_path_buf(), + function_info: None, + output_path: PathBuf::new(), + }; + metadata + .load_and_deserialize() + .expect("Unable to load file"); + let metadata_subset = metadata.subset(false); + AGCJFile { + filename: path.clone(), + function_call_graphs: None, + output_path: output_path.clone(), + function_metadata: Some(metadata_subset), + include_unk: *include_unk, + } } - } else { - AGCJFile { - filename: (*path).clone(), + false => AGCJFile { + filename: path.clone(), function_call_graphs: None, - output_path: (*output_path).clone(), + output_path: output_path.clone(), function_metadata: None, include_unk: *include_unk, - } + }, }; + file.load_and_deserialize() .expect("Unable to load and desearilize JSON"); - if graph_data_type == DataType::Cg { - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.to_petgraph( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone(), - ); - } - } else if graph_data_type == DataType::OneHopCg { - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.one_hop_to_petgraph( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone(), - ); - } - } else if graph_data_type == DataType::CgWithCallers { - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone(), - ); - } - } else if graph_data_type == DataType::OneHopCgWithcallers { - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.one_hop_to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone(), - ); - } - } + + file.process_based_on_graph_data_type( + graph_data_type, + with_features, + 
metadata_type.clone(), + ) } else { debug!("Multiple files found"); @@ -641,52 +599,11 @@ fn main() { debug!("Proceissing {:?}", file.filename); file.load_and_deserialize() .expect("Unable to load and desearilize JSON"); - - for fcg in file.function_call_graphs.as_ref().unwrap() { - match graph_data_type { - DataType::Cg => { - fcg.to_petgraph( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - None, - ); - } - DataType::OneHopCg => { - fcg.one_hop_to_petgraph( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - None, - ); - } - DataType::CgWithCallers => { - fcg.to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - None, - ); - } - DataType::OneHopCgWithcallers => { - fcg.one_hop_to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - None, - ); - } - _ => unreachable!("Not possible hopefully! :O"), - } - } + file.process_based_on_graph_data_type( + graph_data_type, + with_features, + metadata_type.clone(), + ); } else { info!( "Skipping {} as already exists", @@ -720,94 +637,76 @@ fn main() { .zip(metadata_paths_vec) .collect::>(); - combined_cgs_metadata.par_iter().progress().for_each(|(filepath, metapath)| { - let suffix = format!("{}-meta", graph_type.to_owned()); - let full_output_path = get_save_file_path(&PathBuf::from(filepath), output_path, Some(suffix), None); - if !full_output_path.is_dir() { - let mut file = { - let metadata: Option; - if metadata_type.clone().unwrap() == *"finfo" { - let mut metadata_file = AFIJFile { - filename: PathBuf::from(metapath), - function_info: None, - output_path: PathBuf::new(), - }; - debug!("Attempting to load metadata file: {}", metapath); - metadata_file - .load_and_deserialize() - .expect("Unable to load associated metadata file"); - metadata = Some(metadata_file.subset(false)); - } else if metadata_type.clone().unwrap() == *"tiknib" { - let mut metadata_file = TikNibFuncMetaFile { - filename: PathBuf::from(metapath), - function_info: None, - output_path: PathBuf::new(), - }; - - metadata_file.load_and_deserialize().expect("Unable to load associated metadata file"); - metadata = Some(metadata_file.subset()); - } else { - metadata = None - } - - AGCJFile { - filename: PathBuf::from(filepath), - function_call_graphs: None, - output_path: output_path.to_owned(), - function_metadata: metadata, - include_unk: *include_unk, - } - }; - debug!("Attempting to load {:?}", file.filename); - file.load_and_deserialize() - .expect("Unable to load and desearilize JSON"); + combined_cgs_metadata.par_iter().progress().for_each( + |(filepath, metapath)| { + let suffix = format!("{}-meta", graph_type.to_owned()); + let full_output_path = get_save_file_path( + &PathBuf::from(filepath), + output_path, + Some(suffix), + None, + ); + if !full_output_path.is_dir() { + let mut file = { + let metadata: Option; + if metadata_type.clone().unwrap() == *"finfo" { + let mut metadata_file = AFIJFile { + filename: PathBuf::from(metapath), + function_info: None, + output_path: PathBuf::new(), + }; + debug!( + "Attempting to load metadata file: {}", + metapath + ); + metadata_file.load_and_deserialize().expect( + "Unable to load associated metadata file", + ); + metadata = Some(metadata_file.subset(false)); + } else if metadata_type.clone().unwrap() == *"tiknib" { + let mut metadata_file = TikNibFuncMetaFile { + filename: PathBuf::from(metapath), + function_info: None, + output_path: 
PathBuf::new(), + }; + + metadata_file.load_and_deserialize().expect( + "Unable to load associated metadata file", + ); + metadata = Some(metadata_file.subset()); + } else { + metadata = None + } - if graph_data_type == DataType::Cg { - debug!("Generating call graphs using loaded cgs + metadata"); - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.to_petgraph( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone() - ); - } - } else if graph_data_type == DataType::OneHopCg { - debug!("Generating one hop call graphs using loaded cgs + metadata"); - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.one_hop_to_petgraph(&file, &file.output_path, &file.filename, with_features, &file.include_unk, metadata_type.clone()); - } - } else if graph_data_type == DataType::CgWithCallers { - debug!("Generating call graphs with callers using loaded cgs + metadata"); - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, + AGCJFile { + filename: PathBuf::from(filepath), + function_call_graphs: None, + output_path: output_path.to_owned(), + function_metadata: metadata, + include_unk: *include_unk, + } + }; + debug!("Attempting to load {:?}", file.filename); + file.load_and_deserialize() + .expect("Unable to load and desearilize JSON"); + + file.process_based_on_graph_data_type( + graph_data_type, with_features, - &file.include_unk, - metadata_type.clone() + metadata_type.clone(), ); - } - } else if graph_data_type == DataType::OneHopCgWithcallers { - debug!("Generating one hop call graphs with callers using loaded cgs + metadata"); - for fcg in file.function_call_graphs.as_ref().unwrap() { - fcg.one_hop_to_petgraph_with_callers( - &file, - &file.output_path, - &file.filename, - with_features, - &file.include_unk, - metadata_type.clone() + debug!( + "Finished generating cgs + metadata for {:?}", + file.filename ); + } else { + info!( + "Skipping {} as already exists", + full_output_path.to_string_lossy() + ) } - } - debug!("Finished generating cgs + metadata for {:?}", file.filename); - } else { - info!("Skipping {} as already exists", full_output_path.to_string_lossy()) - }}); + }, + ); } } } @@ -872,6 +771,7 @@ fn main() { output_path: _, combo_type, } => { + warn!("This feature is experimental and should be used with caution!"); if combo_type == "finfo+tiknib" { let _finfo_paths = get_json_paths_from_dir(input_path, Some("_finfo".to_string())); From b27b28380e749b442bcd705829c6a152d0f9c3b5 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Sun, 3 Mar 2024 20:48:56 +0000 Subject: [PATCH 35/40] prints into debug entries --- src/files.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/files.rs b/src/files.rs index 63eabf5..7765a53 100644 --- a/src/files.rs +++ b/src/files.rs @@ -411,9 +411,9 @@ impl AGCJFile { // Global Call Graph Related Functions pub fn generate_global_call_graphs(&mut self) { let call_graph = self.build_global_call_graph(); - println!("Num Nodes (Default): {}", call_graph.node_count()); + debug!("Num Nodes (Default): {}", call_graph.node_count()); let cleaned_graph = self.post_process_graph(call_graph); - println!("Num Nodes (Post-Clean): {}", cleaned_graph.node_count()); + debug!("Num Nodes (Post-Clean): {}", cleaned_graph.node_count()); self.save_global_call_graph_to_json(cleaned_graph) } From 337a3aa4b7b9274f0cb311fc99c9df49a54aa5fd Mon Sep 17 00:00:00 2001 From: Br0kej Date: Sun, 3 Mar 2024 21:04:48 +0000 
Subject: [PATCH 36/40] [refactor] more refactoring to simplify logic around graph generation

---
 src/files.rs | 23 +++++++++++++++++++++--
 src/main.rs  | 32 +++++---------------------
 2 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/src/files.rs b/src/files.rs
index 7765a53..b56b011 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -491,7 +491,6 @@ impl AGCJFile {
             Some("gcg".to_string()),
             Some("_cg".to_string()),
         );
-        check_or_create_dir(&full_output_path);
         full_output_path.set_extension("json");
 
@@ -508,7 +507,7 @@ impl AGCJFile {
     }
 
     // Local Call Graph Helper Functions
-    pub fn process_based_on_graph_data_type(
+    fn process_function_level_cg(
         &self,
         graph_data_type: DataType,
         with_features: &bool,
@@ -560,6 +559,26 @@ impl AGCJFile {
             }
         }
     }
+
+    pub fn process_based_on_graph_data_type(
+        &mut self,
+        graph_data_type: DataType,
+        with_features: &bool,
+        metadata_type: Option<String>,
+    ) {
+        match graph_data_type {
+            DataType::GlobalCg => self.generate_global_call_graphs(),
+            DataType::Cg
+            | DataType::OneHopCg
+            | DataType::OneHopCgWithcallers
+            | DataType::CgWithCallers => self.process_function_level_cg(
+                graph_data_type,
+                with_features,
+                metadata_type.clone(),
+            ),
+            _ => unreachable!("Unreachable!"),
+        }
+    }
 }
 
 #[derive(Serialize, Deserialize, Debug)]
diff --git a/src/main.rs b/src/main.rs
index afb8143..625fc65 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -484,7 +484,7 @@ fn main() {
                     #[cfg(feature = "inference")]
                     if feature_vec_type == FeatureType::ModelEmbedded {
                         if tokeniser_fp.is_none() || model_fp.is_none() {
-                            println!("Both Tokeniser and Model filespaths are needed");
+                            println!("Both Tokenizer and Model file paths are needed");
                             exit(100)
                         } else {
                             agfj_graph_embedded_feats(
@@ -503,28 +503,6 @@ fn main() {
                 } else {
                     error!("--feature-type/-f is required for creating CFG's")
                 }
-            } else if graph_data_type == DataType::GlobalCg {
-                warn!("This functionality currently only supports Function name node types.");
-                // Need to add the functionality for adding metadata
-                // to the calls graphs as well as saving them
-                if Path::new(path).is_file() {
-                    let mut file = AGCJFile {
-                        filename: (*path).clone(),
-                        function_call_graphs: None,
-                        output_path: (*output_path).clone(),
-                        function_metadata: None,
-                        include_unk: *include_unk,
-                    };
-                    file.load_and_deserialize()
-                        .expect("Unable to load and desearilize JSON");
-                    info!(
-                        "Generating and saving global call graph for: {}",
-                        path.display()
-                    );
-                    file.generate_global_call_graphs();
-                } else {
-                    todo!("Parallel generation of Global Call Graphs is currently not implemented");
-                }
             } else {
                 if Path::new(path).is_file() {
                     let mut file = match with_features {
@@ -534,6 +512,7 @@ fn main() {
                             function_info: None,
                             output_path: PathBuf::new(),
                         };
+                        debug!("AFIJ Object: {:?}", metadata);
                         metadata
                             .load_and_deserialize()
                             .expect("Unable to load file");
@@ -557,12 +536,11 @@ fn main() {
 
                 file.load_and_deserialize()
                     .expect("Unable to load and desearilize JSON");
-
                 file.process_based_on_graph_data_type(
                     graph_data_type,
                     with_features,
                     metadata_type.clone(),
-                )
+                );
             } else {
                 debug!("Multiple files found");
 
@@ -596,9 +574,9 @@ fn main() {
                             function_metadata: None,
                             include_unk: *include_unk,
                         };
-                        debug!("Proceissing {:?}", file.filename);
+                        debug!("Processing {:?}", file.filename);
                         file.load_and_deserialize()
-                            .expect("Unable to load and desearilize JSON");
+                            .expect("Unable to load and deserialize JSON");
                         file.process_based_on_graph_data_type(
                             graph_data_type,
                             with_features,

From 4b6e1c640c0773fb91f7d9d94193e5dc16e7a760 Mon Sep 17 00:00:00 2001
From: Br0kej Date: Sun, 3 Mar 2024 21:34:11 +0000
Subject: [PATCH 37/40] [feature] adding support for node features in global call graph generation

---
 src/files.rs | 47 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/src/files.rs b/src/files.rs
index b56b011..0ee7c4d 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -6,7 +6,10 @@ use crate::consts::*;
 use crate::errors::FileLoadError;
 #[cfg(feature = "inference")]
 use crate::inference::InferenceJob;
-use crate::networkx::NetworkxDiGraph;
+use crate::networkx::{
+    CallGraphFuncWithMetadata, CallGraphNodeFeatureType, CallGraphTikNibFeatures, CallGraphTypes,
+    NetworkxDiGraph,
+};
 use crate::utils::{check_or_create_dir, get_save_file_path};
 use enum_as_inner::EnumAsInner;
 use indicatif::ParallelProgressIterator;
@@ -409,12 +412,12 @@ impl AGCJFile {
     }
 
     // Global Call Graph Related Functions
-    pub fn generate_global_call_graphs(&mut self) {
+    pub fn generate_global_call_graphs(&mut self, metadata_type: Option<String>) {
         let call_graph = self.build_global_call_graph();
         debug!("Num Nodes (Default): {}", call_graph.node_count());
         let cleaned_graph = self.post_process_graph(call_graph);
         debug!("Num Nodes (Post-Clean): {}", cleaned_graph.node_count());
-        self.save_global_call_graph_to_json(cleaned_graph)
+        self.save_global_call_graph_to_json(cleaned_graph, metadata_type)
     }
 
     fn build_global_call_graph(&mut self) -> Graph {
@@ -482,8 +485,40 @@ impl AGCJFile {
         }
         graph
     }
-    fn save_global_call_graph_to_json(&self, graph: Graph) {
-        let networkx_graph = NetworkxDiGraph::from(graph);
+
+    fn add_node_features_to_global_call_graph(
+        &self,
+        graph: Graph,
+        metadata_type: Option<String>,
+    ) -> CallGraphTypes {
+        match metadata_type.unwrap().as_str() {
+            "finfo" => {
+                let networkx_graph = NetworkxDiGraph::<CallGraphFuncWithMetadata>::from((
+                    graph,
+                    self.function_metadata.as_ref().unwrap().as_afij().unwrap(),
+                ));
+                CallGraphTypes::CGMeta(networkx_graph)
+            }
+            "tiknib" => {
+                let networkx_graph = NetworkxDiGraph::<CallGraphTikNibFeatures>::from((
+                    graph,
+                    self.function_metadata.as_ref().unwrap().as_agfj().unwrap(),
+                ));
+                CallGraphTypes::TikNib(networkx_graph)
+            }
+            _ => unreachable!("Impossible :D"),
+        }
+    }
+    fn save_global_call_graph_to_json(
+        &self,
+        graph: Graph,
+        metadata_type: Option<String>,
+    ) {
+        let networkx_graph = if metadata_type.is_some() {
+            self.add_node_features_to_global_call_graph(graph, metadata_type)
+        } else {
+            CallGraphTypes::CGName(NetworkxDiGraph::from(graph))
+        };
 
         let mut full_output_path = get_save_file_path(
             &self.filename,
@@ -567,7 +602,7 @@ impl AGCJFile {
         metadata_type: Option<String>,
     ) {
         match graph_data_type {
-            DataType::GlobalCg => self.generate_global_call_graphs(),
+            DataType::GlobalCg => self.generate_global_call_graphs(metadata_type.clone()),
             DataType::Cg
             | DataType::OneHopCg
             | DataType::OneHopCgWithcallers

From ba2a770db15897462ffc20a331ebc477d1058d4e Mon Sep 17 00:00:00 2001
From: Br0kej Date: Sun, 3 Mar 2024 21:37:01 +0000
Subject: [PATCH 38/40] typos

---
 src/main.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 625fc65..b5ae575 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -666,7 +666,7 @@ fn main() {
                         };
                         debug!("Attempting to load {:?}", file.filename);
                         file.load_and_deserialize()
-                            .expect("Unable to load and desearilize JSON");
+                            .expect("Unable to load and deserialize JSON");
 
                         file.process_based_on_graph_data_type(
                             graph_data_type,
                             with_features,
@@ -872,7 +872,7 @@ fn main() {
 
     if job.input_path_type == PathType::Dir {
         info!("Directory found - will parallel process");
-
info!("Creating threadpool with {} threads ", num_threads); + info!("Creating thread pool with {} threads ", num_threads); rayon::ThreadPoolBuilder::new() .num_threads(*num_threads) .build_global() From 3a7b7580cf725e4653c3de41dd9510fb99fe6f0f Mon Sep 17 00:00:00 2001 From: Br0kej Date: Sun, 3 Mar 2024 21:42:14 +0000 Subject: [PATCH 39/40] another typo --- src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index b5ae575..69e49b5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -535,7 +535,7 @@ fn main() { }; file.load_and_deserialize() - .expect("Unable to load and desearilize JSON"); + .expect("Unable to load and deserialize JSON"); file.process_based_on_graph_data_type( graph_data_type, with_features, From 51984aebf137772bdc2bf528ad187c02387b3240 Mon Sep 17 00:00:00 2001 From: Br0kej Date: Sun, 3 Mar 2024 21:45:12 +0000 Subject: [PATCH 40/40] [tidy] applying clippy suggestions --- src/files.rs | 13 +-- src/main.rs | 301 +++++++++++++++++++++++++-------------------------- 2 files changed, 155 insertions(+), 159 deletions(-) diff --git a/src/files.rs b/src/files.rs index 0ee7c4d..1cdc209 100644 --- a/src/files.rs +++ b/src/files.rs @@ -7,10 +7,9 @@ use crate::errors::FileLoadError; #[cfg(feature = "inference")] use crate::inference::InferenceJob; use crate::networkx::{ - CallGraphFuncWithMetadata, CallGraphNodeFeatureType, CallGraphTikNibFeatures, CallGraphTypes, - NetworkxDiGraph, + CallGraphFuncWithMetadata, CallGraphTikNibFeatures, CallGraphTypes, NetworkxDiGraph, }; -use crate::utils::{check_or_create_dir, get_save_file_path}; +use crate::utils::get_save_file_path; use enum_as_inner::EnumAsInner; use indicatif::ParallelProgressIterator; use itertools::Itertools; @@ -552,7 +551,7 @@ impl AGCJFile { match graph_data_type { DataType::Cg => { fcg.to_petgraph( - &self, + self, &self.output_path, &self.filename, with_features, @@ -562,7 +561,7 @@ impl AGCJFile { } DataType::OneHopCg => { fcg.one_hop_to_petgraph( - &self, + self, &self.output_path, &self.filename, with_features, @@ -572,7 +571,7 @@ impl AGCJFile { } DataType::CgWithCallers => { fcg.to_petgraph_with_callers( - &self, + self, &self.output_path, &self.filename, with_features, @@ -582,7 +581,7 @@ impl AGCJFile { } DataType::OneHopCgWithcallers => { fcg.one_hop_to_petgraph_with_callers( - &self, + self, &self.output_path, &self.filename, with_features, diff --git a/src/main.rs b/src/main.rs index 69e49b5..2cd86cc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -503,189 +503,186 @@ fn main() { } else { error!("--feature-type/-f is required for creating CFG's") } - } else { - if Path::new(path).is_file() { - let mut file = match with_features { - true => { - let mut metadata = AFIJFile { - filename: metadata_path.as_ref().unwrap().to_path_buf(), - function_info: None, - output_path: PathBuf::new(), - }; - debug!("AFIJ Object: {:?}", metadata); - metadata - .load_and_deserialize() - .expect("Unable to load file"); - let metadata_subset = metadata.subset(false); - AGCJFile { - filename: path.clone(), - function_call_graphs: None, - output_path: output_path.clone(), - function_metadata: Some(metadata_subset), - include_unk: *include_unk, - } - } - false => AGCJFile { + } else if Path::new(path).is_file() { + let mut file = match with_features { + true => { + let mut metadata = AFIJFile { + filename: metadata_path.as_ref().unwrap().to_path_buf(), + function_info: None, + output_path: PathBuf::new(), + }; + debug!("AFIJ Object: {:?}", metadata); + metadata + .load_and_deserialize() + 
.expect("Unable to load file"); + let metadata_subset = metadata.subset(false); + AGCJFile { filename: path.clone(), function_call_graphs: None, output_path: output_path.clone(), - function_metadata: None, + function_metadata: Some(metadata_subset), include_unk: *include_unk, - }, - }; + } + } + false => AGCJFile { + filename: path.clone(), + function_call_graphs: None, + output_path: output_path.clone(), + function_metadata: None, + include_unk: *include_unk, + }, + }; - file.load_and_deserialize() - .expect("Unable to load and deserialize JSON"); - file.process_based_on_graph_data_type( - graph_data_type, - with_features, - metadata_type.clone(), - ); - } else { - debug!("Multiple files found"); + file.load_and_deserialize() + .expect("Unable to load and deserialize JSON"); + file.process_based_on_graph_data_type( + graph_data_type, + with_features, + metadata_type.clone(), + ); + } else { + debug!("Multiple files found"); - if metadata_path.is_none() & with_features { + if metadata_path.is_none() & with_features { + error!("with features active - require --metadata-path argument"); + exit(1) + }; + + let mut file_paths_vec = get_json_paths_from_dir(path, Some("_cg".to_string())); + info!( + "{} files found. Beginning Processing.", + file_paths_vec.len() + ); + // if without metadata + if !with_features & metadata_type.is_none() { + debug!("Creating call graphs without any node features"); + file_paths_vec.par_iter().progress().for_each(|path| { + let suffix = graph_type.to_owned().to_string(); + let full_output_path = get_save_file_path( + &PathBuf::from(path), + output_path, + Some(suffix), + None, + ); + if !full_output_path.is_dir() { + let mut file = AGCJFile { + filename: path.to_owned().parse().unwrap(), + function_call_graphs: None, + output_path: output_path.to_owned(), + function_metadata: None, + include_unk: *include_unk, + }; + debug!("Processing {:?}", file.filename); + file.load_and_deserialize() + .expect("Unable to load and deserialize JSON"); + file.process_based_on_graph_data_type( + graph_data_type, + with_features, + metadata_type.clone(), + ); + } else { + info!( + "Skipping {} as already exists", + full_output_path.to_string_lossy() + ) + } + }) + } else { + debug!("Creating call graphs with node features"); + debug!("Getting metadata file paths"); + // its more than one file + if metadata_path.is_none() { error!("with features active - require --metadata-path argument"); exit(1) }; - let mut file_paths_vec = - get_json_paths_from_dir(path, Some("_cg".to_string())); - info!( - "{} files found. 
Beginning Processing.", - file_paths_vec.len() + if with_features & metadata_type.is_none() { + error!("with features requires metadata_type to be set") + } + let mut metadata_paths_vec = get_json_paths_from_dir( + metadata_path.as_ref().unwrap(), + Some(metadata_type.as_ref().unwrap().to_string()), ); - // if without metadata - if !with_features & metadata_type.is_none() { - debug!("Creating call graphs without any node features"); - file_paths_vec.par_iter().progress().for_each(|path| { - let suffix = graph_type.to_owned().to_string(); + + file_paths_vec.sort(); + metadata_paths_vec.sort(); + + assert_eq!(file_paths_vec.len(), metadata_paths_vec.len()); + let combined_cgs_metadata = file_paths_vec + .into_iter() + .zip(metadata_paths_vec) + .collect::>(); + + combined_cgs_metadata.par_iter().progress().for_each( + |(filepath, metapath)| { + let suffix = format!("{}-meta", graph_type.to_owned()); let full_output_path = get_save_file_path( - &PathBuf::from(path), + &PathBuf::from(filepath), output_path, Some(suffix), None, ); if !full_output_path.is_dir() { - let mut file = AGCJFile { - filename: path.to_owned().parse().unwrap(), - function_call_graphs: None, - output_path: output_path.to_owned(), - function_metadata: None, - include_unk: *include_unk, + let mut file = { + let metadata: Option; + if metadata_type.clone().unwrap() == *"finfo" { + let mut metadata_file = AFIJFile { + filename: PathBuf::from(metapath), + function_info: None, + output_path: PathBuf::new(), + }; + debug!( + "Attempting to load metadata file: {}", + metapath + ); + metadata_file + .load_and_deserialize() + .expect("Unable to load associated metadata file"); + metadata = Some(metadata_file.subset(false)); + } else if metadata_type.clone().unwrap() == *"tiknib" { + let mut metadata_file = TikNibFuncMetaFile { + filename: PathBuf::from(metapath), + function_info: None, + output_path: PathBuf::new(), + }; + + metadata_file + .load_and_deserialize() + .expect("Unable to load associated metadata file"); + metadata = Some(metadata_file.subset()); + } else { + metadata = None + } + + AGCJFile { + filename: PathBuf::from(filepath), + function_call_graphs: None, + output_path: output_path.to_owned(), + function_metadata: metadata, + include_unk: *include_unk, + } }; - debug!("Processing {:?}", file.filename); + debug!("Attempting to load {:?}", file.filename); file.load_and_deserialize() .expect("Unable to load and deserialize JSON"); + file.process_based_on_graph_data_type( graph_data_type, with_features, metadata_type.clone(), ); + debug!( + "Finished generating cgs + metadata for {:?}", + file.filename + ); } else { info!( "Skipping {} as already exists", full_output_path.to_string_lossy() ) } - }) - } else { - debug!("Creating call graphs with node features"); - debug!("Getting metadata file paths"); - // its more than one file - if metadata_path.is_none() { - error!("with features active - require --metadata-path argument"); - exit(1) - }; - - if with_features & metadata_type.is_none() { - error!("with features requires metadata_type to be set") - } - let mut metadata_paths_vec = get_json_paths_from_dir( - metadata_path.as_ref().unwrap(), - Some(metadata_type.as_ref().unwrap().to_string()), - ); - - file_paths_vec.sort(); - metadata_paths_vec.sort(); - - assert_eq!(file_paths_vec.len(), metadata_paths_vec.len()); - let combined_cgs_metadata = file_paths_vec - .into_iter() - .zip(metadata_paths_vec) - .collect::>(); - - combined_cgs_metadata.par_iter().progress().for_each( - |(filepath, metapath)| { - let suffix = 
format!("{}-meta", graph_type.to_owned()); - let full_output_path = get_save_file_path( - &PathBuf::from(filepath), - output_path, - Some(suffix), - None, - ); - if !full_output_path.is_dir() { - let mut file = { - let metadata: Option; - if metadata_type.clone().unwrap() == *"finfo" { - let mut metadata_file = AFIJFile { - filename: PathBuf::from(metapath), - function_info: None, - output_path: PathBuf::new(), - }; - debug!( - "Attempting to load metadata file: {}", - metapath - ); - metadata_file.load_and_deserialize().expect( - "Unable to load associated metadata file", - ); - metadata = Some(metadata_file.subset(false)); - } else if metadata_type.clone().unwrap() == *"tiknib" { - let mut metadata_file = TikNibFuncMetaFile { - filename: PathBuf::from(metapath), - function_info: None, - output_path: PathBuf::new(), - }; - - metadata_file.load_and_deserialize().expect( - "Unable to load associated metadata file", - ); - metadata = Some(metadata_file.subset()); - } else { - metadata = None - } - - AGCJFile { - filename: PathBuf::from(filepath), - function_call_graphs: None, - output_path: output_path.to_owned(), - function_metadata: metadata, - include_unk: *include_unk, - } - }; - debug!("Attempting to load {:?}", file.filename); - file.load_and_deserialize() - .expect("Unable to load and deserialize JSON"); - - file.process_based_on_graph_data_type( - graph_data_type, - with_features, - metadata_type.clone(), - ); - debug!( - "Finished generating cgs + metadata for {:?}", - file.filename - ); - } else { - info!( - "Skipping {} as already exists", - full_output_path.to_string_lossy() - ) - } - }, - ); - } + }, + ); } } }