From 35c8d57fde52eb6a40fae623799898a8ef40081c Mon Sep 17 00:00:00 2001
From: marypilataki
Date: Wed, 20 Nov 2024 12:40:18 +0000
Subject: [PATCH 1/3] add option to provide audio samples for prediction

---
 basic_pitch/inference.py | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py
index 25b062c..58d6856 100644
--- a/basic_pitch/inference.py
+++ b/basic_pitch/inference.py
@@ -213,7 +213,7 @@ def window_audio_file(
 
 
 def get_audio_input(
-    audio_path: Union[pathlib.Path, str], overlap_len: int, hop_size: int
+    audio_path_or_array: Union[pathlib.Path, str, np.ndarray], sample_rate: Optional[int], overlap_len: int, hop_size: int
 ) -> Iterable[Tuple[npt.NDArray[np.float32], Dict[str, float], int]]:
     """
     Read wave file (as mono), pad appropriately, and return as
@@ -229,7 +229,17 @@ def get_audio_input(
     """
     assert overlap_len % 2 == 0, f"overlap_length must be even, got {overlap_len}"
 
-    audio_original, _ = librosa.load(str(audio_path), sr=AUDIO_SAMPLE_RATE, mono=True)
+    if isinstance(audio_path_or_array, np.ndarray):
+        audio_original = audio_path_or_array
+        if sample_rate is None:
+            raise ValueError("Sample rate must be provided when input is an array of audio samples.")
+        elif sample_rate != AUDIO_SAMPLE_RATE:
+            audio_original = librosa.resample(audio_original, orig_sr=sample_rate, target_sr=AUDIO_SAMPLE_RATE)
+        # convert to mono if necessary
+        if audio_original.ndim != 1:
+            audio_original = librosa.to_mono(audio_original)
+    else:
+        audio_original, _ = librosa.load(str(audio_path_or_array), sr=AUDIO_SAMPLE_RATE, mono=True)
 
     original_length = audio_original.shape[0]
     audio_original = np.concatenate([np.zeros((int(overlap_len / 2),), dtype=np.float32), audio_original])
@@ -267,14 +277,16 @@ def unwrap_output(
 
 
 def run_inference(
-    audio_path: Union[pathlib.Path, str],
+    audio_path_or_array: Union[pathlib.Path, str, np.ndarray],
+    sample_rate: Optional[int],
     model_or_model_path: Union[Model, pathlib.Path, str],
     debug_file: Optional[pathlib.Path] = None,
 ) -> Dict[str, np.array]:
     """Run the model on the input audio path.
 
     Args:
-        audio_path: The audio to run inference on.
+        audio_path_or_array: The audio to run inference on. Can be either the path to an audio file or a numpy array of audio samples.
+        sample_rate: Sample rate of the audio samples. Required if audio_path_or_array is a numpy array, ignored otherwise.
         model_or_model_path: A loaded Model or path to a serialized model to load.
         debug_file: An optional path to output debug data to. Useful for testing/verification.
 
@@ -292,7 +304,7 @@
     hop_size = AUDIO_N_SAMPLES - overlap_len
 
     output: Dict[str, Any] = {"note": [], "onset": [], "contour": []}
-    for audio_windowed, _, audio_original_length in get_audio_input(audio_path, overlap_len, hop_size):
+    for audio_windowed, _, audio_original_length in get_audio_input(audio_path_or_array, sample_rate, overlap_len, hop_size):
         for k, v in model.predict(audio_windowed).items():
             output[k].append(v)
 
@@ -415,7 +427,8 @@ def save_note_events(
 
 
 def predict(
-    audio_path: Union[pathlib.Path, str],
+    audio_path_or_array: Union[pathlib.Path, str, np.ndarray],
+    sample_rate: Optional[int] = None,
     model_or_model_path: Union[Model, pathlib.Path, str] = ICASSP_2022_MODEL_PATH,
     onset_threshold: float = 0.5,
     frame_threshold: float = 0.3,
@@ -434,7 +447,8 @@
     """Run a single prediction.
 
     Args:
-        audio_path: File path for the audio to run inference on.
+        audio_path_or_array: File path for the audio to run inference on, or a numpy array of audio samples.
+        sample_rate: Sample rate of the audio samples. Required if audio_path_or_array is a numpy array, ignored otherwise.
         model_or_model_path: A loaded Model or path to a serialized model to load.
         onset_threshold: Minimum energy required for an onset to be considered present.
         frame_threshold: Minimum energy requirement for a frame to be considered present.
@@ -449,9 +463,12 @@
     """
 
     with no_tf_warnings():
-        print(f"Predicting MIDI for {audio_path}...")
+        if isinstance(audio_path_or_array, np.ndarray):
+            print("Predicting MIDI...")
+        else:
+            print(f"Predicting MIDI for {audio_path_or_array}...")
 
-        model_output = run_inference(audio_path, model_or_model_path, debug_file)
+        model_output = run_inference(audio_path_or_array, sample_rate, model_or_model_path, debug_file)
         min_note_len = int(np.round(minimum_note_length / 1000 * (AUDIO_SAMPLE_RATE / FFT_HOP)))
         midi_data, note_events = infer.model_output_to_notes(
             model_output,

From f5b5dc9f61a536b7d601b9ba22dfb7ef6d58b921 Mon Sep 17 00:00:00 2001
From: marypilataki
Date: Tue, 3 Dec 2024 12:04:36 +1300
Subject: [PATCH 2/3] add comments

---
 basic_pitch/inference.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/basic_pitch/inference.py b/basic_pitch/inference.py
index 58d6856..af3c266 100644
--- a/basic_pitch/inference.py
+++ b/basic_pitch/inference.py
@@ -228,16 +228,18 @@ def get_audio_input(
     """
     assert overlap_len % 2 == 0, f"overlap_length must be even, got {overlap_len}"
-
+    # if a numpy array of samples is provided, use it directly
     if isinstance(audio_path_or_array, np.ndarray):
         audio_original = audio_path_or_array
         if sample_rate is None:
             raise ValueError("Sample rate must be provided when input is an array of audio samples.")
+        # resample audio if required
         elif sample_rate != AUDIO_SAMPLE_RATE:
             audio_original = librosa.resample(audio_original, orig_sr=sample_rate, target_sr=AUDIO_SAMPLE_RATE)
         # convert to mono if necessary
         if audio_original.ndim != 1:
             audio_original = librosa.to_mono(audio_original)
+    # load audio file
     else:
         audio_original, _ = librosa.load(str(audio_path_or_array), sr=AUDIO_SAMPLE_RATE, mono=True)
 
     original_length = audio_original.shape[0]

From 450b8ffd5ad42c506d26207dfe2aab7aff27cfeb Mon Sep 17 00:00:00 2001
From: marypilataki
Date: Wed, 4 Dec 2024 12:19:01 +1300
Subject: [PATCH 3/3] update readme

---
 README.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 00acb6e..2f3c6d9 100644
--- a/README.md
+++ b/README.md
@@ -105,13 +105,19 @@ basic-pitch --help
 
 **predict()**
 
-Import `basic-pitch` into your own Python code and run the [`predict`](basic_pitch/inference.py) functions directly, providing an `<input-audio-path>` and returning the model's prediction results:
+Import `basic-pitch` into your own Python code and run the [`predict`](basic_pitch/inference.py) functions directly, providing an `<input-audio-path>` or an `<audio-array>` and returning the model's prediction results:
 
 ```python
+import librosa
 from basic_pitch.inference import predict
-from basic_pitch import ICASSP_2022_MODEL_PATH
 
+# get model predictions given an audio file
 model_output, midi_data, note_events = predict(<input-audio-path>)
+
+# or alternatively, provide an array of samples
+audio_array, sample_rate = librosa.load(<input-audio-path>, mono=True, duration=10.0, offset=5.0)
+model_output, midi_data, note_events = predict(audio_array, sample_rate)
+
 ```
 
 - `<maximum-frequency>` & `<minimum-frequency>` (*float*s) set the maximum and minimum allowed note frequency, in Hz, returned by the model. Pitch events with frequencies outside of this range will be excluded from the prediction results.
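A quick way to sanity-check the new array code path (resampling plus mono mixdown) end to end is the sketch below. It is not part of the patches: it assumes the patched `predict()` signature above, and the synthetic stereo signal, its channel-first `(2, n)` shape, and the 440/660 Hz tones are illustrative choices rather than anything the PR prescribes.

```python
import numpy as np

from basic_pitch.inference import predict

# two seconds of synthetic stereo audio at 44.1 kHz, channel-first (2, n),
# so that librosa.to_mono can average across the leading channel axis
sr = 44100
t = np.linspace(0.0, 2.0, 2 * sr, endpoint=False)
audio = np.stack(
    [np.sin(2 * np.pi * 440.0 * t), np.sin(2 * np.pi * 660.0 * t)]
).astype(np.float32)

# exercises the array branch: resample 44100 -> 22050, then mix down to mono
model_output, midi_data, note_events = predict(audio, sample_rate=sr)

# note_events is a list of note-event tuples (start, end, pitch, amplitude, ...);
# synthetic tones may yield few or no events, so just report the count
print(f"{len(note_events)} note events detected")
```

Pure sine tones are a weak test of the transcription model itself, but they confirm that the shape and sample-rate handling added to `get_audio_input` behaves as intended before a real recording is fed through the same path.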