From 68f6eb8c53c3363643faf5993fe19290298c7462 Mon Sep 17 00:00:00 2001
From: Lukas von Briel <91612110+lvb2000@users.noreply.github.com>
Date: Mon, 4 Nov 2024 21:23:37 +0100
Subject: [PATCH] [ext] Add instruction tracing to Orbetto

See #10.
---
 .github/workflows/build.yml               |    4 +-
 .gitignore                                |    4 +
 ext/orbetto/meson.build                   |   15 +-
 ext/orbetto/metrics/METRICS.md            |   68 ++
 ext/orbetto/metrics/display.py            |  475 ++++++++
 ext/orbetto/metrics/elfFileDecode.py      |  213 ++++
 ext/orbetto/metrics/metrics.py            |  236 ++++
 ext/orbetto/metrics/metrics.yml           |   55 +
 ext/orbetto/metrics/queries.py            |  175 +++
 ext/orbetto/src/LRUCache.hpp              |   71 ++
 ext/orbetto/src/TRACE.md                  |  169 +++
 ext/orbetto/src/device.cpp                |   10 +
 ext/orbetto/src/device.hpp                |   15 +
 ext/orbetto/src/mortrall.hpp              | 1220 +++++++++++++++++++++
 ext/orbetto/src/orbetto.cpp               |  151 ++-
 ext/orbetto/subprojects/croaring.wrap     |    4 +
 ext/orbetto/subprojects/orbuculum.wrap    |    2 +-
 src/emdbg/bench/data/fmu_v5x_orbtrace.cfg |    2 +-
 src/emdbg/bench/data/fmu_v6x_orbtrace.cfg |    2 +-
 src/emdbg/debug/data/cortex_m.gdb         |   34 +-
 src/emdbg/debug/data/orbuculum.gdb        |   79 +-
 21 files changed, 2962 insertions(+), 42 deletions(-)
 create mode 100644 ext/orbetto/metrics/METRICS.md
 create mode 100644 ext/orbetto/metrics/display.py
 create mode 100644 ext/orbetto/metrics/elfFileDecode.py
 create mode 100644 ext/orbetto/metrics/metrics.py
 create mode 100644 ext/orbetto/metrics/metrics.yml
 create mode 100644 ext/orbetto/metrics/queries.py
 create mode 100644 ext/orbetto/src/LRUCache.hpp
 create mode 100644 ext/orbetto/src/TRACE.md
 create mode 100644 ext/orbetto/src/mortrall.hpp
 create mode 100644 ext/orbetto/subprojects/croaring.wrap

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 421e1bd..725dca1 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - run: sudo apt-get update
-      - run: sudo apt-get install -y libusb-1.0-0-dev libzmq3-dev meson libsdl2-dev libdwarf-dev libdw-dev libelf-dev libcapstone-dev python3-pip ninja-build protobuf-compiler
+      - run: sudo apt-get install -y libusb-1.0-0-dev libzmq3-dev meson libsdl2-dev libdwarf-dev libdw-dev libelf-dev libcapstone-dev python3-pip ninja-build protobuf-compiler cmake
       - run: sudo pip3 install meson==1.2.0
       - uses: actions/checkout@v3
       - run: meson setup ./build
@@ -27,7 +27,7 @@ jobs:
     name: Build and Test Orbetto on macOS
     runs-on: macos-13
     steps:
-      - run: brew install zmq sdl2 libelf protobuf meson ninja capstone dwarfutils
+      - run: brew install zmq sdl2 libelf protobuf meson ninja capstone dwarfutils cmake
       - uses: actions/checkout@v3
       - run: meson setup ./build
         working-directory: ext/orbetto
diff --git a/.gitignore b/.gitignore
index 7066030..e46c628 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,10 @@ dist/
 /ext/orbetto/subprojects/orbuculum
 /ext/orbetto/subprojects/libdwarf*
 /ext/orbetto/subprojects/packagecache
+/ext/orbetto/subprojects/CRoaring
 /ext/orbetto/*.perf
 /ext/orbetto/*.gz
 /ext/orbetto/*.debug
+/ext/orbetto/*.zip
+/ext/orbetto/*.roar
+/ext/orbetto/trace.swo
\ No newline at end of file
diff --git a/ext/orbetto/meson.build b/ext/orbetto/meson.build
index 822c021..19693a9 100644
--- a/ext/orbetto/meson.build
+++ b/ext/orbetto/meson.build
@@ -15,6 +15,17 @@ perfbuf_sources = cpp_gen.process(proto_sources, preserve_path_from: meson.curre
 # Compile and link the protobuf source files
 perfbuf_lib = static_library('perfbuf', sources: perfbuf_sources, dependencies: [dependency('protobuf')])
 
+# Specify the include directory for Roaring Bitmap
+cmake = import('cmake')
+rm = cmake.subproject('croaring')
+# Fetch the dependency object
+message('CMake targets:\n - ' + '\n - '.join(rm.target_list()))
+rm_dep1 = rm.dependency('roaring-headers-cpp')
+rm_dep2 = rm.dependency('roaring-headers')
+rm_dep = rm.dependency('roaring')
+rm_inc = rm.get_variable('roaring_headers_cpp_inc') + rm.get_variable('roaring_headers_inc')
+
+
 # Clone the orbuculum git repository
 orbuculum = subproject('orbuculum')
 liborb = orbuculum.get_variable('liborb')
@@ -24,8 +35,8 @@ git_version_info_h = orbuculum.get_variable('git_version_info_h')
 # Compile and link everything together
 executable('orbetto',
     sources: ['src/orbetto.cpp', 'src/device.cpp', git_version_info_h],
-    include_directories: ['src'] + perfbuf_lib.private_dir_include() + orbinc,
-    dependencies: [dependency('protobuf')] + orbuculum.get_variable('dependencies'),
+    include_directories: ['src'] + perfbuf_lib.private_dir_include() + orbinc + rm_inc,
+    dependencies: [dependency('protobuf')] + orbuculum.get_variable('dependencies') + [rm_dep] + [rm_dep1] + [rm_dep2],
     link_with: [liborb, perfbuf_lib],
     install: true,
 )
diff --git a/ext/orbetto/metrics/METRICS.md b/ext/orbetto/metrics/METRICS.md
new file mode 100644
index 0000000..42a7006
--- /dev/null
+++ b/ext/orbetto/metrics/METRICS.md
@@ -0,0 +1,68 @@
+# Trace Metrics with Plotly in Dash
+
+As explained in [TRACE.md](../src/TRACE.md), Perfetto is a very useful tool for
+displaying trace data, but is limited in data size and display variability.
+Therefore, as a proof of concept, some metrics are calculated in Python. We use
+the Protobuf file generated by Orbetto, the SQL trace processor from Perfetto
+and a program counter bitmap generated by CRoaring in Orbetto.
+
+
+## Visualising metrics
+
+To start the local dash server you need to pass a Protobuf file when analyzing
+ITM data or a bitmap for code coverage data. Depending on which argument you
+pass, different metrics can be displayed. Use `-h` to see all options. If you
+are using the `-diff` flag, you will need to provide a second Protobuf file to
+display the metrics based on the difference between the between the two files.
+This is very useful for comparing different versions or settings between two
+runs.
+
+```sh
+# cd orbetto/metrics
+# two metrics
+python3 metrics.py -f /path/to/orbetto.perf -wq -dwq
+# difference between two traces on one metric
+python3 metrics.py -f /path/to/orbetto_1.perf -f2 /path/to/orbetto_1.perf -hp -diff
+# Code coverage for pc bitmap (beta not completely accurate)
+python3 metrics.py -bm /path/to/bitmap.roar -elf /path/to/elf_file.elf -cc
+```
+
+Once executed, copy the URL into your browser and use the plotly interface to
+display the desired plot.
+
+> [!NOTE]  
+> Some metrics take a while to calculate (mainly code coverage).
+
+
+## Advanced metrics and goal
+
+There are already some simple statistical techniques included that can detect
+outliers, but the bigger picture of this work is to do regression testing and
+fuzzing with statistics derived from the instruction trace. For example:
+
+- [x] Regularity of sensor readings.
+- [ ] Comm link throughputs.
+- [x] Scheduling latency vs timeouts.
+- [ ] Thread progress vs semaphores.
+- [x] Callstack changes (code coverage).
+
+All these metrics need to be combined into a large latent space that can be
+diffed (a kind of distance measure) between different versions of PX4 to detect
+bugs in the code before it flies.
+
+
+## GCOV
+
+There has been a quick investigation to use a more advanced and also tested
+tool. The most common is gcov, which is a test coverage program from GCC. Two
+files are required for gcov to display code coverage:
+
+- `.gcno`, which is an advanced source file you get when compiling with the
+   GCC `-ftest-coverage` option.
+- `.gcda`, which is created when the GCC `-fprofile-arcs` option is run.
+
+It should be simple to get the .gcno file as the flag is already a compile
+option in PX4. Theoretically it should be possible to reverse engineer
+the `.gcda` file just from the PC values, ([documentation can be found here]
+(https://github.com/gcc-mirror/gcc/blob/master/gcc/gcov-io.h)) However, it is
+unclear how long this will take and how robust it will be.
diff --git a/ext/orbetto/metrics/display.py b/ext/orbetto/metrics/display.py
new file mode 100644
index 0000000..a4bd1aa
--- /dev/null
+++ b/ext/orbetto/metrics/display.py
@@ -0,0 +1,475 @@
+import plotly.graph_objects as go
+import plotly.express as px
+import numpy as np
+import math as m
+from dash import Dash, dcc, html, callback, Output, Input
+import plotly.figure_factory as ff
+from plotly.subplots import make_subplots
+from queries import *
+from elfFileDecode import *
+
+app = Dash(__name__)
+
+widgets = []
+active_thread = None
+df_functions = None
+df_heap = {}
+heap_title = None
+hist_data = {}
+df_detailed_thread_state = None
+df_code_coverage = None
+code_coverage_file = None
+
+def _find_outliers(thread_intervals,thread_ids):
+    outlier_id = []
+    for function_intervals,ids in zip(thread_intervals,thread_ids):
+        # get mean
+        mean = np.mean(function_intervals)
+        # get std
+        std = np.std(function_intervals)
+        # get outlier indexes
+        idx = np.where(np.abs(function_intervals - mean) > 3 * std)[0]
+        # get ids of outliers
+        outlier_id.extend(ids[idx].tolist())
+    if len(outlier_id) > 0:
+        return 'Outliers with Slice IDs: ' + ', '.join(map(lambda x:str(int(x)), outlier_id))
+    else:
+        return "No outliers found."
+
+def _add_button_for_norm_switch(fig):
+    # Create and add button
+    fig.update_layout(
+        updatemenus=[
+            dict(
+                buttons=list([
+                    dict(label="Percentage",
+                        method="update",
+                        args=[{'histnorm': 'percent'}]
+                    ),
+                    dict(label="Count",
+                        method="update",
+                        args=[{'histnorm': 'count'}]
+                    ),
+                ]),
+                direction="down",
+                pad={"r": 10, "t": 10},
+                showactive=True,
+                x=0,
+                xanchor="left",
+                y=1.2,
+                yanchor="top"
+            ),
+        ]
+    )
+
+def _add_slider_for_bin_size(fig,arr,title,unit):
+    # Create and add slider
+    num_steps = 100
+    min_value = min(arr)
+    max_value = max(arr)
+    diff = max_value - min_value
+    steps = []
+    for i in range(num_steps):
+        bin_size = diff * ((i+1)/100)**2
+        step = dict(
+            method="update",
+            args=[{'xbins.size': bin_size},{'xbins.start': 0}],  # layout attribute
+            label = f"{bin_size:.2f} {unit}"
+        )
+        # step["args"][0]["visible"][i] = True  # Toggle i'th trace to "visible"
+        steps.append(step)
+
+    sliders = [dict(
+        active=10,
+        currentvalue={"prefix": f"{title} "},
+        steps=steps
+    )]
+
+    fig.update_layout(
+        sliders=sliders
+    )
+
+def _add_dropdown_menu_bar_chart(fig):
+    # Add dropdown
+    fig.update_layout(
+        updatemenus=[
+            dict(
+                buttons=list([
+                    dict(label="Percentage",
+                        method="update",
+                        args=[{"visible": [True,False,True,False]},{'yaxis': {'title': 'Total CPU Time (%)'}}]
+                    ),
+                    dict(label="Time Values",
+                        method="update",
+                        args=[{"visible": [False,True,False,True]},{'yaxis': {'title': 'Total CPU Time (ns)'}}]
+                    ),
+                ]),
+                direction="down",
+                pad={"r": 10, "t": 10},
+                showactive=True,
+                x=0,
+                xanchor="left",
+                y=1.2,
+                yanchor="top"
+            ),
+        ]
+    )
+
+def bar_chart(df1,df2,x,y,z):
+    widgets.append(html.H1("Thread CPU Time Overview",style={'textAlign': 'center'}))
+    fig = go.Figure()
+    fig.add_trace(go.Bar(x=df1[x], y=df1[y],visible=True,name='Running'))
+    fig.add_trace(go.Bar(x=df1[x], y=df1[z],visible=False,name='Running'))
+    fig.add_trace(go.Bar(x=df2[x], y=df2[y],visible=True,name='Runnable'))
+    fig.add_trace(go.Bar(x=df2[x], y=df2[z],visible=False,name='Runnable'))
+    fig.update_layout(
+        xaxis_title='Thread Name',
+        yaxis_title='Total CPU Time (%)',
+        # change angle of xAxis labels
+        xaxis_tickangle=45,
+        legend_title_text='Thread State'
+    )
+    _add_dropdown_menu_bar_chart(fig)
+    widgets.append(dcc.Graph(figure=fig))
+
+def _add_dropdown_menu_threads(thread_list, type,title, multi,initial_value=[]):
+    widgets.append(html.H4("Select Threads:",style={'margin-bottom': '10px'}))
+    dropdown = dcc.Dropdown(id='thread-select'+ '-' + type + '-' + title,
+                          options=thread_list,
+                          multi= multi,
+                          clearable=True,
+                          value=initial_value,
+                          style={'width': '900px'})
+    widgets.append(dropdown)
+
+def _add_dropdown_menu_functions(id,multi):
+    widgets.append(html.H4("Select Function:",style={'margin-bottom': '10px','margin-left': '20px'}))
+    dropdown = dcc.Dropdown(id=id,
+                          options=[],
+                          multi= multi,
+                          clearable=True,
+                          value=[],
+                          style={'margin-left': '10px','width': '500px'})
+    widgets.append(dropdown)
+
+
+def custom_data_bar_chart(df):
+    global df_functions
+    df_functions = df
+    widgets.append(html.H1("Detailed Function Overview", style={'textAlign': 'center'}))
+    _add_dropdown_menu_threads(df['thread_name'].unique(),'function',False)
+    _add_dropdown_menu_functions('function-select',multi=True)
+    widgets.append(dcc.Graph(id='thread-graph',figure={}))
+
+def _add_dropdown_menu_reg_function(function_list):
+    widgets.append(html.H4("Select Function:",style={'margin-bottom': '10px'}))
+    dropdown = dcc.Dropdown(id='reg-function-select',
+                          options=function_list,
+                          multi= False,
+                          clearable=True,
+                          value=[])
+    widgets.append(dropdown)
+    # include button for reciprocal with increase size
+    widgets.append(html.Button('Toggle Reciprocal', id='toggle-reciprocal', n_clicks=0,style={'font-size': '15px','margin-top': '10px'}))
+
+def histogram(dict):
+    global hist_data
+    hist_data = dict
+    widgets.append(html.H1("Regularity Function Overview",style={'textAlign': 'center'}))
+    _add_dropdown_menu_reg_function(list(dict.keys()))
+    widgets.append(dcc.Graph(id='regularity-graph',figure={}))
+    widgets.append(html.H3("PDF",style={'textAlign': 'center','margin-bottom': '10px'}))
+    widgets.append(dcc.Graph(id='normal-dist-graph',figure={}))
+    widgets.append(html.H5(f"No outliers found.",id='outliers'))
+
+def _add_dropdown_menu_threads_runtime(thread_list):
+    widgets.append(html.H4("Select Thread:",style={'margin-bottom': '10px'}))
+    dropdown = dcc.Dropdown(id='thread-overview-select',
+                          options=thread_list,
+                          multi= False,
+                          clearable=True,
+                          value=[],
+                          style={'margin-left': '10px','width': '500px'})
+    widgets.append(dropdown)
+
+def cpu_waiting_time_pie_chart(df):
+    global df_detailed_thread_state
+    df_detailed_thread_state = df
+    widgets.append(html.H1("Thread CPU Time Detailed",style={'textAlign': 'center'}))
+    _add_dropdown_menu_threads_runtime(df['thread_name'].unique())
+    widgets.append(html.Button('Toggle Normalization', id='toggle-normalization', n_clicks=0,style={'font-size': '15px','margin-top': '10px'}))
+    widgets.append(dcc.Graph(id='aggr-thread-overview-graph',figure={}))
+    widgets.append(dcc.Graph(id='thread-overview-graph-runnable',figure={}))
+    widgets.append(dcc.Graph(id='thread-overview-graph-running',figure={}))
+    widgets.append(dcc.Graph(id='thread-overview-graph-sleeping',figure={}))
+
+def compute_heap_sum(df):
+    return df.groupby('thread_name').agg({'allocation_size': 'sum'}).reset_index()
+
+def heap_pie_chart(df,title):
+    widgets.append(html.H1("Heap Profile"+ " - (" + title + ')',style ={'textAlign': 'center'}))
+    df_aggr = compute_heap_sum(df)
+    df_aggr['allocation_size'] = df_aggr['allocation_size'].abs()
+    if df_aggr['allocation_size'].sum() == 0:
+        widgets.append(html.H3("Traces do not show memory differences when matched.",style={'textAlign': 'center'}))
+        return
+    else:
+        widgets.append(dcc.Graph(figure=pie_chart(df_aggr,'thread_name','allocation_size','Total Heap Usage per Thread')))
+
+
+def heap_counter(df,title):
+    global df_heap
+    df_heap[title] = df
+    _add_dropdown_menu_threads(df['thread_name'].unique(),type='heap',title=title,multi=True)
+    widgets.append(dcc.Graph(id='heap-graph' + '-' + title,figure={}))
+
+def display_code_coverage(df):
+    global df_code_coverage
+    df_code_coverage = df
+    widgets.append(html.H1('Code Coverage',style ={'textAlign': 'center'}))
+    _add_dropdown_menu_threads(df['File'].unique(),type='code-coverage',title='file-select',multi=False)
+    _add_dropdown_menu_functions('function-select-code-coverage',multi=False)
+    #widgets.append(html.Pre(id='highlighted-code', style={'overflow': 'auto', 'height': '500px'}))
+    widgets.append(html.Div(id='highlighted-code', style={
+        'overflow': 'auto', 
+        'height': '500px', 
+        'border': '1px solid black', 
+        'font-family': 'monospace',
+        'white-space': 'pre-wrap',  # Preserve whitespace
+        'background-color': '#f0f0f0',  # Optional: light gray background for better visibility
+    }))
+
+def show(debug):
+    app.layout = html.Div(widgets,style={
+        'display': 'flex',
+        'flexDirection': 'column',
+        'alignItems': 'center',
+        'justifyContent': 'center',
+    })
+    app.run_server(debug=debug)
+
+#---------------------- Dash CallBacks ----------------------#
+@callback(
+    Output('function-select', 'options'),
+    Input('thread-select-function', 'value'),
+)
+def update_function_callback(thread):
+    if thread is None:
+        return []
+    global active_thread
+    active_thread = str(thread)
+    return df_functions[df_functions['thread_name'] == active_thread]['slice_name'].unique().tolist()
+
+@callback(
+    Output('thread-graph', 'figure'),
+    Input('function-select', 'value'),
+)
+def update_counter_function(selected_functions):
+    fig = go.Figure()
+    if active_thread is None:
+        return fig
+    df = df_functions[df_functions['thread_name'] == active_thread]
+    for function in selected_functions:
+        x_axis = np.sort(np.array(df[df['slice_name'] == function]['tss'].iloc[0].split(',')).astype(int)) / 1e6
+        count = df[df['slice_name'] == function]['count'].iloc[0]
+        y_axis = [i for i in range(1,count + 1)]
+        fig.add_trace(go.Scatter(x=x_axis,
+                                 y=y_axis,
+                                 mode='lines+markers',
+                                 name=function))
+    fig.update_layout(
+        xaxis_title='Time [ms]',
+        yaxis_title='Appearence',
+    )
+    return fig
+
+@callback(
+    Output('regularity-graph', 'figure'),
+    Output('normal-dist-graph', 'figure'),
+    Output('outliers', 'children'),
+    Input('reg-function-select', 'value'),
+    Input('toggle-reciprocal', 'n_clicks'),
+)
+def update_reg_graph(selected_function,reciprocal):
+    fig = go.Figure()
+    if selected_function is None or len(selected_function) == 0:
+        return fig,fig,"No outliers found."
+    title = 'Interval [ms]'
+    unit = 'ms'
+    if reciprocal % 2 == 1:
+        title = 'Frequency [kHz]'
+        unit = 'kHz'
+    np_hist = np.array(hist_data[selected_function])
+
+    threads = np.unique(np_hist[:,2])
+    thread_names = []
+    thread_ids = []
+    thread_intervals = []
+    for thread in threads:
+        function_intervals = np_hist[np_hist[:,2] == thread][:,0].tolist()
+        # remove None values
+        function_intervals = [x for x in function_intervals if x is not None]
+        function_intervals = np.divide(function_intervals,1e6)
+        if reciprocal % 2 == 1:
+            function_intervals = np.reciprocal(function_intervals)
+        if len(function_intervals) >= 2:
+            thread_intervals.append(function_intervals)
+            thread_names.append(thread)
+            thread_ids.append(np_hist[np_hist[:,2] == thread][:,1])
+            fig.add_trace(
+                go.Histogram(
+                    x=function_intervals,
+                    histnorm='percent',
+                    name=thread,
+                )
+            )
+    fig.update_layout(
+        xaxis_title=title,
+    )
+    _add_slider_for_bin_size(fig,function_intervals,title,unit)
+    _add_button_for_norm_switch(fig)
+    normal_dist_fig = ff.create_distplot(thread_intervals,group_labels=thread_names,bin_size=.1)
+    normal_dist_fig.update_layout(
+        xaxis_title=title,
+        yaxis_title='Probability Density',
+    )
+    outliers = _find_outliers(thread_intervals,thread_ids)
+    return fig,normal_dist_fig,outliers
+
+@callback(
+    Output('aggr-thread-overview-graph', 'figure'),
+    Output('thread-overview-graph-running', 'figure'),
+    Output('thread-overview-graph-runnable', 'figure'),
+    Output('thread-overview-graph-sleeping', 'figure'),
+    Input('thread-overview-select', 'value'),
+    Input('toggle-normalization', 'n_clicks')
+)
+def update_thread_overview(selected_thread,n_clicks):
+    fig = make_subplots(rows=1, cols=3, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])
+    fig2 = go.Figure()
+    fig3 = go.Figure()
+    fig4 = go.Figure()
+    if selected_thread is None:
+        return fig,fig2,fig3,fig4
+    df = df_detailed_thread_state[df_detailed_thread_state['thread_name'] == str(selected_thread)]
+    if len(df) == 0:
+        return fig,fig2,fig3,fig4
+    df['runnable_interval'] = df['runnable_interval']/1e6
+    df['running_interval'] = df['running_interval']/1e6
+    df['sleeping_interval'] = df['sleeping_interval']/1e6
+    df_aggr = df.groupby('slice_name').agg({'runnable_interval': 'sum', 'running_interval': 'sum', 'sleeping_interval': 'sum'}).reset_index()
+    title1 = 'Runnable'
+    title2 = 'Running'
+    title3 = 'Sleeping'
+    if n_clicks % 2 == 1:
+        df_aggr = df.groupby('slice_name').agg({'runnable_interval': 'mean', 'running_interval': 'mean', 'sleeping_interval': 'mean'}).reset_index()
+        title1 = 'Average Runnable'
+        title2 = 'Average Running'
+        title3 = 'Average Sleeping'
+    fig.add_trace(go.Pie(labels=df_aggr['slice_name'],
+                        values=df_aggr['runnable_interval'],
+                        title=title1),
+                row = 1, col = 1)
+    fig.add_trace(go.Pie(labels=df_aggr['slice_name'],
+                        values=df_aggr['running_interval'],
+                        title=title2),
+                row = 1, col = 2 )
+    fig.add_trace(go.Pie(labels=df_aggr['slice_name'],
+                        values=df_aggr['sleeping_interval'],
+                        title=title3),
+                row = 1, col = 3 )
+    fig2 = px.histogram(df, x='runnable_interval',color = 'slice_name', marginal="rug", hover_data=['slice_id','slice_name'], title='Runnable')
+    fig2.update_layout(
+        xaxis_title='Duration [ms]'
+    )
+    fig3 = px.histogram(df, x='running_interval',color = 'slice_name', marginal="rug", hover_data=['slice_id','slice_name'], title='Running')
+    fig3.update_layout(
+        xaxis_title='Duration [ms]'
+    )
+    fig4 = px.histogram(df, x='sleeping_interval',color = 'slice_name', marginal="rug", hover_data=['slice_id','slice_name'], title='Sleeping')
+    fig4.update_layout(
+        xaxis_title='Duration [ms]'
+    )
+    return fig,fig2,fig3,fig4
+
+@callback(
+    Output('heap-graph-counter', 'figure'),
+    Input('thread-select-heap-counter', 'value'),
+)
+def update_counter_heap(selected_threads):
+    return counter_bar_chart(df_heap['counter'],selected_threads,'thread_name','allocation_size','ts','Heap Counter [Bytes]')
+
+@callback(
+    Output('heap-graph-counter_matched', 'figure'),
+    Input('thread-select-heap-counter_matched', 'value'),
+)
+def update_counter_heap(selected_threads):
+    return counter_bar_chart(df_heap['counter_matched'],selected_threads,'thread_name','allocation_size','ts','Heap Counter [Bytes]')
+
+@callback(
+    Output('function-select-code-coverage', 'options'),
+    Input('thread-select-code-coverage-file-select', 'value'),
+)
+def update_code_coverage_function_select(selected_file):
+    global code_coverage_file
+    if selected_file is None or len(selected_file) == 0:
+        print(selected_file)
+        return ['empty']
+    code_coverage_file = selected_file
+    return df_code_coverage[df_code_coverage['File'] == selected_file]['Function Name'].unique().tolist()
+
+@callback(
+    Output('highlighted-code', 'children'),
+    Input('function-select-code-coverage', 'value'),
+)
+def update_code_coverage_highlighted_code(selected_function):
+    if selected_function is None or len(selected_function) == 0:
+        return ""
+    source_code = get_function_info(bytes(code_coverage_file, 'utf-8'), bytes(selected_function, 'utf-8'))
+    if len(source_code) == 0:
+        return f"Function {selected_function} not found in {code_coverage_file}."
+    return html.Div(children=[html.Div(children=source_code, style={'white-space': 'pre-wrap'})])
+
+#---------------------- General Plots ----------------------#
+
+def counter_bar_chart(df,selection,category,value,ts,title):
+    fig = go.Figure()
+    if selection is None:
+        return fig
+    max_ts = df[ts].max()
+    for s in selection:
+        df_s = df[df[category] == s]
+        df_s = df_s.sort_values(by=[ts])
+        x_axis = np.array(df_s[ts]) / 1e6
+        y_axis = df_s[value].cumsum().to_numpy()
+        if(x_axis != [] and x_axis[-1] < max_ts):
+            x_axis = np.append(x_axis,max_ts / 1e6)
+            y_axis = np.append(y_axis,y_axis[-1])
+        fig.add_trace(go.Scatter(x=x_axis,
+                                 y=y_axis,
+                                 mode='lines+markers',
+                                 name=s))
+    fig.update_layout(
+        xaxis_title='Time [ms]',
+        yaxis_title=title,
+        height=700,
+        width=1400
+    )
+    return fig
+
+def pie_chart(df,category,value,title):
+    fig = go.Figure()
+    fig.add_trace(go.Pie(labels = df[category],
+                        values = df[value],
+                        title=title
+                        )
+                )
+    fig.update_layout(
+        height=800,
+        width=800,
+    )
+    fig.update_traces(
+        textposition='inside',
+        automargin=True
+    )
+    return fig
diff --git a/ext/orbetto/metrics/elfFileDecode.py b/ext/orbetto/metrics/elfFileDecode.py
new file mode 100644
index 0000000..6d43015
--- /dev/null
+++ b/ext/orbetto/metrics/elfFileDecode.py
@@ -0,0 +1,213 @@
+from elftools.common.utils import bytes2str
+from elftools.dwarf.descriptions import describe_form_class
+from elftools.elf.elffile import ELFFile
+import pandas as pd
+from pyroaring import BitMap
+import numpy as np
+from dash import html
+
+GREEN = "\033[92m"  # Green color code
+RESET = "\033[0m"   # Reset color code
+
+elf_path = None
+roar_path = None
+bitmap = None
+
+# Documentation for dwarf classes in pyelftools can be found here:
+# https://github.com/eliben/pyelftools/tree/main/elftools/dwarf
+
+def set_elffile_and_bitmap(elf_path_input, roar_path_input):
+    global elf_path
+    global roar_path
+    elf_path = elf_path_input
+    roar_path = roar_path_input
+
+def get_all_function_names():
+    if elf_path is None:
+        print("Please set the elf file")
+        return
+    
+    print('Processing file:', elf_path)
+    with open(elf_path, 'rb') as f:
+        elffile = ELFFile(f)
+
+        if not elffile.has_dwarf_info():
+            print('  file has no DWARF info')
+            return
+
+        # get_dwarf_info returns a DWARFInfo context object, which is the
+        # starting point for all DWARF-based processing in pyelftools.
+        dwarfinfo = elffile.get_dwarf_info()
+
+        # Return Pandas DataFrame with all function information
+        df = pd.DataFrame(columns=['File','Function Name'])
+
+        #count = 0
+        for CU in dwarfinfo.iter_CUs():
+            #if count > 100:
+            #    break
+            #count += 1
+            TOP_DIE = CU.get_top_DIE()
+            if ".cpp" in bytes2str(TOP_DIE.attributes['DW_AT_name'].value):
+                path = TOP_DIE.attributes['DW_AT_name'].value
+            else:
+                path = TOP_DIE.attributes['DW_AT_comp_dir'].value + bytes('/', 'utf-8') + TOP_DIE.attributes['DW_AT_name'].value
+            for DIE in CU.iter_DIEs():
+                # check for function tag
+                if DIE.tag == 'DW_TAG_subprogram':
+                    try:
+                        if DIE.attributes['DW_AT_inline'].value == 3:
+                            continue
+                    except KeyError:
+                        pass
+                    try:
+                        DIE.attributes['DW_AT_low_pc'].value
+                        DIE.attributes['DW_AT_high_pc'].value
+                    except KeyError:
+                        continue
+                    # check for function name
+                    try :
+                        if DIE.attributes['DW_AT_name'] is not None:
+                            df = pd.concat([df, pd.DataFrame({'File': [bytes2str(path)], 'Function Name': [bytes2str(DIE.attributes['DW_AT_name'].value)]})])
+                    except KeyError:
+                        continue
+        return df
+
+
+def get_function_info(function_path, function_name):
+
+    if elf_path is None:
+        print("Please set the elf file")
+        return
+    
+    with open(elf_path, 'rb') as f:
+        elffile = ELFFile(f)
+
+        if not elffile.has_dwarf_info():
+            print('  file has no DWARF info')
+            return
+
+        # get_dwarf_info returns a DWARFInfo context object, which is the
+        # starting point for all DWARF-based processing in pyelftools.
+        dwarfinfo = elffile.get_dwarf_info()
+
+        # Return Pandas DataFrame with all function information
+        source_code = decode_funcname( dwarfinfo, function_path, function_name)
+
+    return source_code
+
+
+def decode_funcname(dwarfinfo, function_path, function_name):
+    source_code = []
+    for CU in dwarfinfo.iter_CUs():
+        try:
+            TOP_DIE = CU.get_top_DIE()
+            if ".cpp" in bytes2str(TOP_DIE.attributes['DW_AT_name'].value):
+                path = TOP_DIE.attributes['DW_AT_name'].value
+            else:
+                path = TOP_DIE.attributes['DW_AT_comp_dir'].value + bytes('/', 'utf-8') + TOP_DIE.attributes['DW_AT_name'].value
+            if(path == function_path):
+                for DIE in CU.iter_DIEs():
+                    # check for function tag
+                    if DIE.tag == 'DW_TAG_subprogram':
+                        # check for function name
+                        try :
+                            DIE.attributes['DW_AT_name'].value
+                        except KeyError:
+                            continue
+                        if DIE.attributes['DW_AT_name'].value == function_name:
+                            low_line, high_line, l, a, inline = get_max_min_line(CU, dwarfinfo)
+                            if low_line is not None and high_line is not None:
+                                source_code = get_code_lines(function_path, low_line, high_line)
+                                lines = np.arange(low_line, high_line)
+                                for i,line in enumerate(lines):
+                                    idx = np.where(np.isin(l, line))
+                                    if len(idx[0]) > 0 and len(source_code) > i:
+                                        # check if address is in bitmap
+                                        if a[idx[0][0]] in bitmap:
+                                            print(f"Line {line} with Address {hex(a[idx[0][0]])} is covered")
+                                            source_code[i] = html.Mark(source_code[i], style={'background-color': "rgba(0, 255, 0, 0.1)"})
+                                        else:
+                                            source_code[i] = html.Mark(source_code[i], style={'background-color': "rgba(255, 0, 0, 0.1)"})
+        except KeyError:
+            continue
+    return source_code
+
+def get_max_min_line(CU, dwarfinfo):
+    lineprog = dwarfinfo.line_program_for_CU(CU)
+    max = 0
+    min = 100000
+    prev_addr = 0
+    lines = []
+    addresses = []
+    inline = []
+    for entry in lineprog.get_entries():
+        if entry.state is None:
+            continue
+        if prev_addr != entry.state.address:
+            prev_addr = entry.state.address
+            if entry.state.line not in lines:
+                lines.append(entry.state.line)
+                addresses.append(entry.state.address)
+                if entry.state.file != 1:
+                    inline.append(True)
+                else:
+                    inline.append(False)
+        if entry.state.line > max:
+            max = entry.state.line
+        if entry.state.line < min:
+            min = entry.state.line
+    if max == 0 or min == 100000:
+        return None, None, None, None, None
+    
+    return min, max, lines, addresses, inline
+
+def get_code_lines(file_name, low_line, high_line):
+    try:
+        with open(file_name, 'r') as file:
+            lines = file.readlines()
+            return lines[low_line - 1:high_line]
+    except FileNotFoundError:
+        return f"File {file_name} not found."
+    except IndexError:
+        return f"Lines {low_line}:{high_line} out of range in {file_name}."
+    
+def read_roar_file():
+    """
+    Read a Roaring bitmap from a .roar file, assuming the file contains
+    serialized 8-bit unsigned integers.
+    """
+    values = []
+    with open(roar_path, 'rb') as f:
+        # Read the entire file content as bytes
+        data = f.read()
+        
+        # Ensure that the length of the data is not empty
+        if len(data) == 0:
+            raise ValueError("File is empty. No data to read.")
+        
+        # Each byte is an 8-bit unsigned integer
+        for byte in data:
+            values.append(byte)  # Append each byte (0-255) to the list
+
+    return values
+
+def init_bitmap():
+    global bitmap
+    #Load values from the .roar file
+    values_from_file = read_roar_file()
+
+    # Create a BitMap instance and update it with the values
+    bm = BitMap()
+    bm = bm.deserialize(bytes(values_from_file))
+
+    bitmap = bm.to_array()
+
+def format_code_coverage(df):
+    # Function to apply styling without loop
+    def highlight_green(column):
+        # Create a Series where True values get 'background-color: green' and False gets ''
+        return ['background-color: green' if v else '' for v in df['Highlight']]
+
+    # Apply the style
+    return df.style.apply(highlight_green, subset=['Code'])
diff --git a/ext/orbetto/metrics/metrics.py b/ext/orbetto/metrics/metrics.py
new file mode 100644
index 0000000..7d4e3e8
--- /dev/null
+++ b/ext/orbetto/metrics/metrics.py
@@ -0,0 +1,236 @@
+import pandas as pd
+from queries import *
+from display import *
+from elfFileDecode import *
+import argparse
+
+def init_argparse():
+    parser = argparse.ArgumentParser(description="Visualize trace metrics")
+    parser.add_argument('-f','--file',
+                        help='Path to protobuf trace file',
+                        type = str,
+                        required = False)
+    parser.add_argument('-f2','--file2',
+                        help='Path to second protobuf trace file for diffs',
+                        type = str,
+                        required = False)
+    parser.add_argument('-wq','--work_queue_overview',
+                        help='Display work queue overview. (Runnable/Running)',
+                        action='store_true',
+                        default = False)
+    parser.add_argument('-dwq','--detailed_work_queue',
+                        help='Display detailed work queue overview. (Runnable/Running)',
+                        action='store_true',
+                        default = False)
+    parser.add_argument('-fp','--function_pairs',
+                        help='Display function pairs',
+                        action='store_true',
+                        default = False)
+    parser.add_argument('-fr','--function_regularity',
+                        help='Display function regularity',
+                        action='store_true',
+                        default = False)
+    parser.add_argument('-dma','--dma_throughputs',
+                        help='Display DMA throughput',
+                        action='store_true',
+                        default = False)
+    parser.add_argument('-sd','--semaphore_deadlocks',
+                        help='Display semaphore deadlocks',
+                        action='store_true',
+                        default = False)
+    parser.add_argument('-db','--debug',
+                        help='Display debug information',
+                        action='store_true',
+                        default = False)
+    parser.add_argument('-hp', "--heap_profile",
+                        help='Display heap profile',
+                        action='store_true',
+                        default = False)
+    parser.add_argument('-cc',"--code_coverage",
+                        help='Display code coverage',
+                        action='store_true',
+                        default = False)
+    parser.add_argument('-bm',"--bitmap",
+                        help='Path to bitmap file which is needed for code coverage',
+                        type = str,
+                        required = False)
+    parser.add_argument('-elf',"--elf_file",
+                        help='Path to elf file which is needed for code coverage',
+                        type = str,
+                        required = False)
+
+    parser.add_argument('-diff', "--difference",
+                        help='Switch all other options to display differences between two trace files',
+                        action='store_true',
+                        default = False)
+    
+    return parser.parse_args()
+
+
+
+def cpu_time():
+    cpu_running = get_all_cpu_time('Running')
+    cpu_runnable = get_all_cpu_time('R')
+    total_cpu_running_time = cpu_running['cpu_time'].sum()
+    cpu_running['percentage'] = cpu_running['cpu_time']/total_cpu_running_time * 100
+    total_cpu_runnable_time = cpu_runnable['cpu_time'].sum()
+    cpu_runnable['percentage'] = cpu_runnable['cpu_time']/total_cpu_runnable_time * 100
+    bar_chart(cpu_running,cpu_runnable,'thread_name','percentage','cpu_time')
+
+def cpu_waiting_time():
+    df = get_detailed_thread_state_perfetto()
+    cpu_waiting_time_pie_chart(df)
+
+def function_distribution():
+    functions = get_function_distribution()
+    custom_data_bar_chart(functions)
+
+def regularity():
+    df = get_function_intervals()
+    df['merged'] = df.apply(lambda row: (row['interval'], row['next_slice_id'],row['thread_name']), axis=1)
+    dict = df.groupby('slice_name')['merged'].apply(list).to_dict()
+    histogram(dict)
+
+def dma_throughputs():
+    pass
+
+def semaphore_deadlocks():
+    pass
+
+def extract_malloc(df):
+    df_malloc = df[df['allocation_type']=='malloc']
+    df_malloc['allocation_info'] = df_malloc['slice_name'].apply(lambda x: x.split('[')[1][:-1])
+    df_malloc['allocation_pointer'] = df_malloc['allocation_info'].apply(lambda x: x.split(',')[0])
+    df_malloc['allocation_size'] = df_malloc['allocation_info'].apply(lambda x: int(x.split(',')[1]))
+    return df_malloc
+
+def extract_free(df):
+    df_free = df[df['allocation_type']=='free']
+    df_free['allocation_pointer'] = df_free['slice_name'].str.extract(r'\((.*?)\)')
+    df_free['allocation_size'] = df_free['slice_name'].apply(lambda x: -int(x.split('(')[2][:-1]))
+    return df_free
+
+
+def match_heap_pointers(df1,df2):
+    # copy
+    df_malloc = df1.copy()
+    df_free = df2.copy()
+    # convert to int
+    df_malloc['ts'] = df_malloc['ts'].astype(int)
+    df_free['ts'] = df_free['ts'].astype(int)
+    # reset index
+    df_malloc.reset_index(drop=True, inplace=True)
+    df_free.reset_index(drop=True, inplace=True)
+    for index, row in df_malloc.iterrows():
+        # Filter rows that match the conditions
+        future_frees = df_free[(df_free['allocation_pointer'] == row['allocation_pointer']) &
+                               (df_free['ts'] >= row['ts'])]
+        if not future_frees.empty:
+            df_malloc.drop(index, inplace=True)
+            df_free.drop(future_frees['ts'].idxmin(), inplace=True)
+    assert(df_free.empty)
+    return df_malloc
+
+
+def extract_heap_profile(df):
+    if df.empty:
+        print("No heap profile data found")
+        return None
+    df['allocation_type'] = df['slice_name'].apply(lambda x: x.split('(')[0])
+    df_malloc = extract_malloc(df)
+    df_free = extract_free(df)
+    return pd.concat([df_malloc,df_free]), match_heap_pointers(df_malloc,df_free)
+
+def heap_profile():
+    df = get_heap_profile()
+    df, df_matched = extract_heap_profile(df)
+    if(df is not None):
+        heap_pie_chart(df, title="absolute count")
+        heap_counter(df, title="counter")
+        heap_pie_chart(df_matched , title="absolute count matched")
+        heap_counter(df_matched, title="counter_matched")
+
+
+def diff_heap_profile(df1,df2):
+    max_ts = min(df1['ts'].max(),df2['ts'].max())
+    df1 = df1[df1['ts'] <= max_ts]
+    df2 = df2[df2['ts'] <= max_ts]
+    df2['allocation_size'] = df2['allocation_size'] * -1
+    return pd.concat([df1,df2])
+
+def heap_profile_diff():
+    df1 = get_heap_profile()
+    df1, df1_matched = extract_heap_profile(df1)
+    df2 = get_heap_profile2()
+    df2, df2_matched = extract_heap_profile(df2)
+    if(df1 is not None and df2 is not None):
+        df = diff_heap_profile(df1,df2)
+        df_matched = diff_heap_profile(df1_matched,df2_matched)
+        heap_pie_chart(df, title="absolute count")
+        heap_counter(df, title="counter")
+        heap_pie_chart(df_matched, title="absolute count matched")
+        heap_counter(df_matched, title="counter_matched")
+
+
+def code_coverage(elf, path):
+    set_elffile_and_bitmap(elf, path)
+    init_bitmap()
+    df = get_all_function_names()
+    display_code_coverage(df)
+
+
+def code_coverage_diff():
+    pass
+    
+if __name__ == "__main__":
+    args = init_argparse()
+    init_trace_processor(args.file)
+    if args.difference:
+        assert(args.file2)
+        init_trace_processor2(args.file2)  
+    if args.work_queue_overview:
+        if args.difference:
+            pass
+        else:
+            cpu_time()
+    if args.detailed_work_queue:
+        if args.difference:
+            pass
+        else:
+            cpu_waiting_time()
+    if args.function_pairs:
+        if args.difference:
+            pass
+        else:
+            function_distribution()
+    if args.function_regularity:
+        if args.difference:
+            pass
+        else:
+            regularity()
+    if args.dma_throughputs:
+        if args.difference:
+            pass
+        else:
+            dma_throughputs()
+    if args.semaphore_deadlocks:
+        if args.difference:
+            pass
+        else:
+            semaphore_deadlocks()
+    if args.heap_profile:
+        if args.difference:
+            heap_profile_diff()
+        else:
+            heap_profile()
+    if args.code_coverage:
+        if args.bitmap and args.elf_file:
+            if args.difference:
+                pass
+            else:
+                code_coverage(args.elf_file, args.bitmap)
+        else:
+            print("Please provide a bitmap file")
+    show(args.debug)
+
+
diff --git a/ext/orbetto/metrics/metrics.yml b/ext/orbetto/metrics/metrics.yml
new file mode 100644
index 0000000..d04dd20
--- /dev/null
+++ b/ext/orbetto/metrics/metrics.yml
@@ -0,0 +1,55 @@
+name: metrics
+channels:
+  - defaults
+  - conda-forge
+dependencies:
+  - bzip2=1.0.8=h80987f9_6
+  - ca-certificates=2024.9.24=hca03da5_0
+  - libffi=3.4.4=hca03da5_1
+  - ncurses=6.4=h313beb8_0
+  - openssl=3.0.15=h80987f9_0
+  - pip=24.2=py311hca03da5_0
+  - python=3.11.10=hb885b13_0
+  - readline=8.2=h1a28f6b_0
+  - setuptools=75.1.0=py311hca03da5_0
+  - sqlite=3.45.3=h80987f9_0
+  - tk=8.6.14=h6ba3021_0
+  - wheel=0.44.0=py311hca03da5_0
+  - xz=5.4.6=h80987f9_1
+  - zlib=1.2.13=h18a0788_1
+  - pip:
+      - blinker==1.8.2
+      - certifi==2024.8.30
+      - charset-normalizer==3.4.0
+      - click==8.1.7
+      - dash==2.18.1
+      - dash-core-components==2.0.0
+      - dash-html-components==2.0.0
+      - dash-table==5.0.0
+      - flask==3.0.3
+      - idna==3.10
+      - importlib-metadata==8.5.0
+      - itsdangerous==2.2.0
+      - jinja2==3.1.4
+      - markupsafe==3.0.2
+      - nest-asyncio==1.6.0
+      - numpy==2.1.2
+      - packaging==24.1
+      - pandas==2.2.3
+      - perfetto==0.11.0
+      - plotly==5.24.1
+      - protobuf==5.28.3
+      - pyelftools==0.31
+      - pyroaring==1.0.0
+      - python-dateutil==2.9.0.post0
+      - pytz==2024.2
+      - requests==2.32.3
+      - retrying==1.3.4
+      - six==1.16.0
+      - tenacity==9.0.0
+      - typing-extensions==4.12.2
+      - tzdata==2024.2
+      - urllib3==2.2.3
+      - werkzeug==3.0.6
+      - zipp==3.20.2
+prefix: /Users/lukasvonbriel/anaconda3/envs/metrics
diff --git a/ext/orbetto/metrics/queries.py b/ext/orbetto/metrics/queries.py
new file mode 100644
index 0000000..775b21f
--- /dev/null
+++ b/ext/orbetto/metrics/queries.py
@@ -0,0 +1,175 @@
+from perfetto.trace_processor import TraceProcessor
+
+tp = None
+tp2 = None
+
+def init_trace_processor(trace):
+    global tp
+    tp = TraceProcessor(trace=trace)
+    print(tp)
+
+def init_trace_processor2(trace):
+    global tp2
+    tp2 = TraceProcessor(trace=trace)
+
+def display_all_available_tables():
+    # Display all the available columns in the trace
+    return tp.query("SELECT name FROM sqlite_master WHERE type='table';").as_pandas_dataframe()
+
+def get_all_cpu_time(state):
+    query= f"""
+    DROP VIEW IF EXISTS cpu_time;
+    CREATE VIEW cpu_time AS
+    SELECT
+        dur,
+        thread.name as thread_name
+    FROM thread_state
+    JOIN thread USING (utid)
+    WHERE state = '{state}';
+
+    SELECT *
+    FROM (
+        SELECT 
+            SUM(dur) AS cpu_time,
+            thread_name
+        FROM cpu_time
+        GROUP BY thread_name
+    ) AS subquery
+    ORDER BY cpu_time DESC;
+    """
+    return tp.query(query).as_pandas_dataframe()
+
+
+def get_function_distribution():
+    query = """
+    DROP VIEW IF EXISTS slice_with_thread_names;
+    CREATE VIEW slice_with_thread_names AS
+    SELECT
+        ts,
+        dur,
+        tid,
+        slice.name as slice_name,
+        slice.id as slice_id,utid,
+        thread.name as thread_name
+    FROM slice
+    JOIN thread_track ON thread_track.id = slice.track_id
+    JOIN thread USING (utid);
+
+    SELECT *
+    FROM (
+        SELECT 
+            SUM(dur) AS cpu_time,
+            GROUP_CONCAT(DISTINCT ts) AS tss,
+            tid,
+            slice_name,
+            thread_name,
+            COUNT(*) AS count
+        FROM slice_with_thread_names
+        GROUP BY slice_name, tid
+    ) AS subquery
+    WHERE tid > 100000
+    ORDER BY cpu_time DESC;
+    """
+    return tp.query(query).as_pandas_dataframe()
+
+def get_function_intervals():
+    query = """
+    DROP VIEW IF EXISTS slice_with_thread_names;
+    CREATE VIEW slice_with_thread_names AS
+    SELECT
+        ts,
+        dur,
+        tid,
+        slice.name as slice_name,
+        slice.id as slice_id,utid,
+        thread.name as thread_name
+    FROM slice
+    JOIN thread_track ON thread_track.id = slice.track_id
+    JOIN thread USING (utid);
+
+    SELECT *
+    FROM(
+        SELECT
+            ts,
+            dur,
+            LEAD(ts) OVER (PARTITION BY slice_name ORDER BY ts) AS next_ts,
+            LEAD(ts) OVER (PARTITION BY slice_name ORDER BY ts) - (ts + dur) AS interval,
+            slice_name,
+            LEAD(slice_id) OVER (PARTITION BY slice_name ORDER BY ts) AS next_slice_id,
+            thread_name
+        FROM (
+            SELECT
+                ts - 35696370 AS ts,
+                dur,
+                tid,
+                slice_name,
+                slice_id,
+                thread_name
+            FROM slice_with_thread_names
+            ORDER BY ts
+        ) AS subquery1
+        WHERE tid > 100000
+        ORDER BY slice_name, ts
+    )AS subquery2;
+    """
+    return tp.query(query).as_pandas_dataframe()
+
+heap_query ="""
+    DROP VIEW IF EXISTS mallocs;
+    CREATE VIEW mallocs AS
+    SELECT
+        slice.ts,
+        slice.id as slice_id,
+        slice.name as slice_name,
+        thread.name as thread_name
+    FROM slice
+    JOIN thread_track ON thread_track.id = slice.track_id
+    JOIN thread USING (utid)
+    WHERE (slice.name LIKE '%malloc%' AND slice.name LIKE '%[%') OR (slice.name LIKE '%free%' AND slice.name LIKE '%<-%');
+
+    SELECT *
+    FROM mallocs
+    ORDER BY ts;
+    """
+
+def get_heap_profile():
+    return tp.query(heap_query).as_pandas_dataframe()
+
+def get_heap_profile2():
+    return tp2.query(heap_query).as_pandas_dataframe()
+
+def get_detailed_thread_state_perfetto():
+
+    query = f"""
+    DROP VIEW IF EXISTS workqueue_slices;
+    CREATE VIEW workqueue_slices AS
+    SELECT
+        slice.ts as ts,
+        slice.dur as dur,
+        slice.id as slice_id,
+        slice.name as slice_name,
+        thread.name as thread_name,
+        thread.utid as utid
+    FROM slice
+    JOIN thread_track ON thread_track.id = slice.track_id
+    JOIN thread USING (utid)
+    WHERE thread.tid < 10000;
+
+    DROP TABLE IF EXISTS slice_thread_state_breakdown;
+    CREATE VIRTUAL TABLE slice_thread_state_breakdown
+    USING SPAN_LEFT_JOIN(
+        workqueue_slices PARTITIONED utid,
+        thread_state PARTITIONED utid
+    );
+
+    SELECT
+        thread_name,
+        slice_name,
+        slice_id,
+        SUM(CASE WHEN state = 'Running' THEN dur ELSE 0 END) as running_interval,
+        SUM(CASE WHEN state = 'R' THEN dur ELSE 0 END) as runnable_interval,
+        SUM(CASE WHEN state = 'S' THEN dur ELSE 0 END) as sleeping_interval
+    FROM slice_thread_state_breakdown
+    GROUP BY slice_id;
+    """
+    return tp.query(query).as_pandas_dataframe()
\ No newline at end of file
diff --git a/ext/orbetto/src/LRUCache.hpp b/ext/orbetto/src/LRUCache.hpp
new file mode 100644
index 0000000..bc7ab14
--- /dev/null
+++ b/ext/orbetto/src/LRUCache.hpp
@@ -0,0 +1,71 @@
+/* 
+ * File:   lrucache.hpp
+ * Author: Alexander Ponomarev
+ *
+ * Created on June 20, 2013, 5:09 PM
+ */
+
+#ifndef _LRUCACHE_HPP_INCLUDED_
+#define	_LRUCACHE_HPP_INCLUDED_
+
+#include <unordered_map>
+#include <list>
+#include <cstddef>
+#include <stdexcept>
+
+namespace cache {
+
+template<typename key_t, typename value_t>
+class lru_cache {
+public:
+	typedef typename std::pair<key_t, value_t> key_value_pair_t;
+	typedef typename std::list<key_value_pair_t>::iterator list_iterator_t;
+
+	lru_cache(size_t max_size) :
+		_max_size(max_size) {
+	}
+	
+	void put(const key_t& key, const value_t& value) {
+		auto it = _cache_items_map.find(key);
+		_cache_items_list.push_front(key_value_pair_t(key, value));
+		if (it != _cache_items_map.end()) {
+			_cache_items_list.erase(it->second);
+			_cache_items_map.erase(it);
+		}
+		_cache_items_map[key] = _cache_items_list.begin();
+		
+		if (_cache_items_map.size() > _max_size) {
+			auto last = _cache_items_list.end();
+			last--;
+			_cache_items_map.erase(last->first);
+			_cache_items_list.pop_back();
+		}
+	}
+	
+	const value_t& get(const key_t& key) {
+		auto it = _cache_items_map.find(key);
+		if (it == _cache_items_map.end()) {
+			throw std::range_error("There is no such key in cache");
+		} else {
+			_cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second);
+			return it->second->second;
+		}
+	}
+	
+	bool exists(const key_t& key) const {
+		return _cache_items_map.find(key) != _cache_items_map.end();
+	}
+	
+	size_t size() const {
+		return _cache_items_map.size();
+	}
+	
+private:
+	std::list<key_value_pair_t> _cache_items_list;
+	std::unordered_map<key_t, list_iterator_t> _cache_items_map;
+	size_t _max_size;
+};
+
+} // namespace cache
+
+#endif	/* _LRUCACHE_HPP_INCLUDED_ */
\ No newline at end of file
diff --git a/ext/orbetto/src/TRACE.md b/ext/orbetto/src/TRACE.md
new file mode 100644
index 0000000..61b6fd9
--- /dev/null
+++ b/ext/orbetto/src/TRACE.md
@@ -0,0 +1,169 @@
+# Mortrall: NuttX instrumentation with Perfetto and Instruction Trace
+
+We can trace the NuttX scheduler at half CPU speed by streaming the ETM debug
+trace over the parallel TRACE pins and capturing it to a file.
+
+The Mortrall class decodes the ETM packets and generates a CallStack that can be
+visualized with Perfetto. This includes thread switching and exception
+detection in order to be able to follow the code execution on the corresponding
+PX4 FMUv5x or FMUv6x.
+
+![](https://github.com/niklaut/orbetto-support-files/blob/main/perfetto_callstack.png)
+
+The structure of the code is based on the mortem module from orbuculum, but has
+been heavily modified to suit our needs. There have also been many fixes in
+loadelf and traceDecoder_etm4.
+
+
+## Capture the trace
+
+Use OrbTrace to enable the TRACE pin output on the FMU.
+
+> [!TIP]  
+> Data accumulates very fast. If you want to see the whole CallStack in
+> Perfetto Trace, trace less than a second.
+
+
+### FMUv5x
+
+Start GDB in your PX4 Autopilot source directory:
+
+```sh
+# cd PX4-Autopilot
+python3 -m emdbg.bench.fmu --target px4_fmu-v5x --orbtrace
+```
+
+Reset the target and start the capture:
+
+```
+(gdb) px4_etm_trace_tpiu_swo_stm32f7
+(gdb) Continue
+```
+
+
+### FMUv6x
+
+Start GDB in your PX4 Autopilot source directory:
+
+```sh
+# cd PX4-Autopilot
+python3 -m emdbg.bench.fmu --target px4_fmu-v6x --orbtrace
+```
+
+Reset the target and start the capture. You might need to power cycle the target:
+
+```
+(gdb) px4_etm_trace_tpiu_swo_stm32h7
+(gdb) Continue
+```
+
+
+### Visualization
+
+To convert the `trace.swo` binary to Perfetto format call Orbetto:
+
+```sh
+# cd embedded-debug-tools/ext/orbetto
+build/orbetto -f path/to/trace.swo -e path/to/fmu.elf -b path/to/fmu_bootloader.elf -t 1
+# Use -v 3 to show advanced debugging data that also prints instructions
+# Use -a to set a cycle count threshold at which verbose printing starts
+```
+
+If the `trace.swo` files contain instruction data, Mortrall is automatically
+initialized and and creates a CallStack in the Protobuf file. You should now
+have an `orbetto.perf` file that you can drag into the
+[Perfetto user interface](https://ui.perfetto.dev).
+
+
+## Limitations
+
+There are a few things you need to be aware of when using this tool.
+
+
+### Hardware
+
+It is  important to know that there are two main configurations when tracing
+instructions. There is implicit and explicit tracing, which can be set in the
+`cortex_m.gdb` file (implicit = `startETM`; explicit = `startETM 1`). With
+implicit tracing, the minimum amount of trace data is sent. This mainly refers
+to the number of address packets, which basically transmit the current PC
+value (there are different formats). With implicit tracing, address packets are
+only sent if there is no other way to follow a branch. With explicit tracing,
+the address is sent after each branch, resulting in a more robust trace that
+can be theoretically be analyzed from any point.
+
+
+#### FMUv5x
+
+Unfortunately, the trace pinouts of the chip are multipurpose. It is very
+important to remove the 3 LEDs from the data lines. Their non-linear
+(I-V) characteristics makes it impossible to trace high frequency instruction
+data. However, if the LEDs are removed, the resulting stubs can cause unwanted
+reflections, which compromises signal integrity. In addition, the trace lines
+are not matched in length. In particular, the `nARMED` pin is also connected to
+one of the trace lines, making it three times the length of all the other
+traces. We have roughly estimated that these problems are not too bad as long
+as you stay under 100MHz when tracing.
+
+The following signals have been observed with an oscilloscope and a clock
+frequency of 50Mhz after removing the LEDs:
+
+![](https://github.com/niklaut/orbetto-support-files/blob/main/SCR17.BMP)
+
+Since the STM32F7 has only a very small trace buffer (no ETF), explicit tracing
+only works if you increase the CPU prescaler in `board.h` to 8 and also disable
+the instruction and data cache to further slow down the CPU. Implicit tracing
+instead only needs the increased prescaler, but still has a few (~50) overflows
+when tracing. This probably only happens when there are a lot of thread
+switches during PX4 initialization, which causes a burst in address and branch
+packets.
+
+
+#### FMUv6x
+
+With FMUv6x (STM32H7), the Skynode hardware issues remain much the same.
+
+However, we now have a 4kByte ETF (Embedded Trace FIFO) inside the STM, which
+buffers the trace data during bursts. In addition, the CPU and trace clock
+sources have different prescalers so that they can be set separately. The trace
+clock must have a prescaler of 16 to stay within the hardware signal integrity
+limits. For the CPU clock we need at least a prescaler of 4 (=half max CPU
+speed) to get a correct call stack with a small amount of overflow. With a
+prescaler of 8 there will be no overflows.
+
+> [!NOTE]  
+> FMUv6x has only been tested with implicit traces.
+
+
+### Perfetto
+
+Due to memory limitations in the Perfetto UI (both local and web interface),
+only a very short trace can be visualized (~25MB of trace data results in
+~300MB of Protobuf).
+
+If you already know which thread you want to monitor, this problem can be solved
+by displaying only the corresponding thread as it is very unlikely that you
+need to see the full CallStack. This is not implemented yet, but can easily be
+added.
+
+As mentioned above, the decoding process in Mortrall makes it possible to trace
+every executed instruction and jump precisely. However, both the Python trace
+processor and the UI have a memory limitation that makes it impossible to use
+this information further (regression testing, fuzzing, etc.) within the
+Protobuf format. To work around this problem for now, an additional library
+called CRoaring has been added. This library encodes all executed program
+counter values into a bitmap. It also has a Python interface for further
+analysis of this data. A description of how to use this can be found in
+[METRICS.md](../metrics/METRICS.md).
+
+> [!NOTE]  
+> Unfortunately, the Perfetto batch processor does not help in this case, as
+> it only allows parallel decoding of multiple Protobuf files (ie. many
+> different traces) and not with a very large one.
+
+
+### NuttX
+
+The computation of the CallStack is tightly coupled to the way NuttX handles
+thread switching. This means that it is very likely that this will not work for
+any other RTOS without modifications.
diff --git a/ext/orbetto/src/device.cpp b/ext/orbetto/src/device.cpp
index 2e9ad7a..3559ebd 100644
--- a/ext/orbetto/src/device.cpp
+++ b/ext/orbetto/src/device.cpp
@@ -308,6 +308,7 @@ Device::Device(std::string_view hint)
         // _register_table = &registers_stm32f765;
         _max_irq = 16+109;
         _clock = 216'000'000;
+        _id = DeviceId::SYKNODE_V5X;
     }
     else if (hint.find("stm32h753") != std::string_view::npos or
              hint.find("v6x") != std::string_view::npos)
@@ -316,5 +317,14 @@ Device::Device(std::string_view hint)
         _register_table = &registers_stm32h753;
         _max_irq = 16+149;
         _clock = 480'000'000;
+        _id = DeviceId::SYKNODE_V6X;
+    }
+    // this was used in a pure NuttX build as ETM test
+    else if (hint.find("nuttx") != std::string_view::npos)
+    {
+        _irq_table = &irq_names_stm32f765;
+        _max_irq = 16+109;
+        _clock = 48'000'000;
+        _id = DeviceId::SYKNODE_V5X;
     }
 }
diff --git a/ext/orbetto/src/device.hpp b/ext/orbetto/src/device.hpp
index a592c46..fbcd2d3 100644
--- a/ext/orbetto/src/device.hpp
+++ b/ext/orbetto/src/device.hpp
@@ -7,6 +7,14 @@
 #include <string_view>
 #include <cstdint>
 
+// enum for skynode device id
+enum DeviceId
+{
+    SYKNODE_V5X = 0,
+    SYKNODE_V6X = 1,
+    SKYNODE_S = 2,
+};
+
 class Device
 {
 public:
@@ -49,9 +57,16 @@ class Device
     clock() const
     { return _clock; }
 
+    /// Provides the device id.
+    int
+    id() const
+    { return (int)_id; }
+
+
 private:
     const IrqTable *_irq_table{nullptr};
     const RegisterTable *_register_table{nullptr};
     uint32_t _clock{0};
     int16_t _max_irq{0};
+    DeviceId _id; // device id: 0
 };
diff --git a/ext/orbetto/src/mortrall.hpp b/ext/orbetto/src/mortrall.hpp
new file mode 100644
index 0000000..5f2b9ce
--- /dev/null
+++ b/ext/orbetto/src/mortrall.hpp
@@ -0,0 +1,1220 @@
+#pragma once
+
+#include <vector>
+#include <protos/perfetto/trace/trace.pb.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <assert.h>
+#include <strings.h>
+#include <string.h>
+#include <signal.h>
+#include <getopt.h>
+#include <iostream>
+#include <fstream>
+#include <deque>
+#include <functional>
+#include <time.h>
+#include <iomanip>
+
+#include "git_version_info.h"
+#include "generics.h"
+#include "nw.h"
+#include "traceDecoder.h"
+#include "tpiuDecoder.h"
+#include "loadelf.h"
+#include "sio.h"
+#include "stream.h"
+
+#include "LRUCache.hpp"
+#include "roaring.hh"
+
+using namespace roaring;
+
+//--------------------------------------------------------------------------------------//
+//--------------------------------- BEGIN REGION Defines -------------------------------//
+//--------------------------------------------------------------------------------------//
+
+#define INTERVAL_TIME_MS    (1000)      /* Intervaltime between acculumator resets */
+#define HANG_TIME_MS        (200)       /* Time without a packet after which we dump the buffer */
+#define TICK_TIME_MS        (100)       /* Time intervals for screen updates and keypress check */
+#define STACK_BUFFER_SIZE   (4096)      /* Size of the stack buffer */
+#define SCRATCH_STRING_LEN  (65535)     /* Max length for a string under construction */
+
+#define MAX_CALL_STACK (30)
+#define DEFAULT_PM_BUFLEN_K (32)
+#define MAX_BUFFER_SIZE (100)
+
+#define CACHE_SIZE (10000)
+
+// struct for caching of capstone dissasembly
+struct capstoneCache
+{
+    char *assembly;
+    enum instructionClass ic;
+    uint32_t newaddr;
+};
+
+/* Materials required to be maintained across callbacks for output construction */
+struct opConstruct
+{
+    uint32_t currentFileindex;           /* The filename we're currently in */
+    struct symbolFunctionStore *currentFunctionptr;       /* The function we're currently in */
+    uint32_t currentLine;                /* The line we're currently in */
+    uint32_t workingAddr;                /* The address we're currently in */
+};
+
+struct CallStack
+{
+    symbolMemaddr stack[MAX_CALL_STACK];    /* Stack of calls */
+    int stackDepth{-1};                     /* Current stack depth */
+    int perfettoStackDepth{-1};             /* Stack depth at which perfetto is currently */
+};
+
+struct RunTime
+{
+    enum TRACEprotocol protocol;        /* Encoding protocol to use */
+    struct TRACEDecoder i;
+
+    struct symbol *s;                   /* Currently used elf */
+    struct symbol *_s;                  /* Symbols read from elf */
+    struct symbol *sb;                  /* Symbols read from bootloader elf*/
+
+    bool bootloader;                    /* Set if we are still using bootloader elf */
+
+    struct opConstruct op;              /* Materials required to be maintained across callbacks for output construction */
+
+    bool traceRunning;                  /* Set if we are currently receiving trace */
+    uint32_t context;                   /* Context we are currently working under */
+
+    bool committed{true};               /* Set if we have committed to the current jump */
+    bool resentStackDel;                /* Possibility to remove an entry from the stack, if address not given */
+
+    bool resentStackSwitch{false};      /* Call stack has been switched in last iteration */
+    bool exceptionEntry{false};         /* Set if we are currently in an exception entry */
+    int exceptionId{0};                 /* Exception ID */
+    bool exceptionActive{false};        /* Set if we are currently in an exception */
+    uint32_t returnAddress{0};          /* Return address for exception */
+
+    uint16_t instruction_count{0};      /* Instruction count for precise timing between cycle count packets (Assume all instructions take equally long) */      
+
+    CallStack *callStack;               /* Pointer to current active call stack */
+    CallStack exceptionCallStack;       /* Separate call stack for exceptions (all exceptions need to convert to depth 0; nested exceptions not implemented)*/
+    CallStack bootloaderCallStack;      /* Call stack for bootloader */
+};
+
+struct CallStackBuffer
+{
+    unsigned int lastCycleCount{0};                                     /* Last cycle count */
+    perfetto::protos::FtraceEvent *proto_buffer[MAX_BUFFER_SIZE];       /* Buffer for protobuf */
+    uint16_t instruction_counts[MAX_BUFFER_SIZE];                       /* Instruction count */
+    uint64_t global_interpolations[MAX_BUFFER_SIZE];                    /* Global timestamps */
+    int proto_buffer_index{0};                                          /* Index for the buffer */
+}csb;
+
+/* Cache to store capstone disassembly for fast access */
+cache::lru_cache<uint32_t, capstoneCache> cache_lru(CACHE_SIZE);
+
+//--------------------------------------------------------------------------------------//
+//---------------------------------- END REGION Defines --------------------------------//
+//--------------------------------------------------------------------------------------//
+
+class Mortrall
+{
+    public:
+        /* Perfetto trace elements */
+        static inline perfetto::protos::Trace *perfetto_trace;
+        static inline perfetto::protos::FtraceEventBundle *ftrace;
+        static constexpr uint32_t PID_CALLSTACK = 400000;
+        static constexpr uint32_t PID_BOOTLOADER = 401000;
+        static constexpr uint32_t PID_EXCEPTION = 500000;
+        static constexpr uint32_t PID_PC = 600000;
+        static inline uint32_t activeCallStackThread;
+        static inline std::unordered_map<int, const char *> exception_names;        // Store interrupt names to give each a unique perfetto thread
+        static inline Roaring r1;
+
+        /* Data Struct to store information about the current decoding process (used in orbmortem) */
+        static inline RunTime *r;
+
+        /* Callstack elements */
+        static inline uint16_t tid;                                                 // current running thread id
+        static inline uint16_t pending_tid;                                         // next thread id (gets set after a thread switch pattern has been detected)
+        static inline std::deque<uint16_t> thread_switches;                         // thread switches from software Pre-Pump
+        static inline int thread_switches_size = 0;                                 // size of thread switches
+        static inline struct symbolFunctionStore *top_thread_func;                  // current running function of the current thread
+        static inline std::map<uint16_t, CallStack> callstacks;                     // CallStack map to store the callStacks of the different threads
+
+        /* Callback function to update timestamp in ITM trace */
+        static inline std::function<void(uint64_t)> update_itm_timestamp;           // callback function to update timestamp in ITM trace
+        static inline std::function<void()> switch_itm_symbols;                     // callback function to switch symbols in ITM trace (elf file)
+
+        /* Mortrall flags and Parameters */
+        static inline bool initialized = false;                                     // Initialization flag
+        static inline bool pending_thread_switch;                                   // Pending thread switch flag    
+        static inline enum verbLevel verbose;                                       // Verbosity level
+        static inline uint64_t cps;                                                 // Clocks per second of used cpu
+        static inline uint64_t cycleCountThreshold;                                 // cycle count threshold from which debugging information is output
+
+        /* Default Constructor */
+        constexpr Mortrall()
+        {
+            ;
+        }
+
+        /* Initialization */
+        void inline init(perfetto::protos::Trace *perfetto_trace,perfetto::protos::FtraceEventBundle *ftrace,uint64_t cps,enum verbLevel v, struct symbol *s,struct symbol *sb, std::function<void(uint64_t)> update_itm_timestamp_input,std::function<void()> switch_itm_symbols, uint64_t ccth)
+        {
+            /* Set all parameters and functions */
+            Mortrall::perfetto_trace = perfetto_trace;
+            Mortrall::ftrace = ftrace;
+            Mortrall::cps = cps;
+            Mortrall::r = new RunTime();
+            Mortrall::r->_s = s;
+            Mortrall::r->sb = sb;
+            Mortrall::callstacks[0] = CallStack();
+            Mortrall::r->exceptionCallStack = CallStack();
+            Mortrall::r->bootloaderCallStack = CallStack();
+            /* Check whether Bootloader is included or not */
+            if(sb)
+            {
+                r->bootloader = true;
+                r->s = sb;
+                Mortrall::r->callStack = &r->bootloaderCallStack;
+                Mortrall::activeCallStackThread = PID_BOOTLOADER;
+            }else
+            {
+                r->bootloader = false;
+                r->s = s;
+                Mortrall::r->callStack = &Mortrall::callstacks[0];
+                Mortrall::activeCallStackThread = PID_CALLSTACK;
+            }
+            Mortrall::pending_thread_switch = false;
+            Mortrall::top_thread_func = NULL;
+            Mortrall::update_itm_timestamp = update_itm_timestamp_input;
+            Mortrall::switch_itm_symbols = switch_itm_symbols;
+            if (v)
+            {
+                Mortrall::verbose = v;
+            }else
+            {
+                Mortrall::verbose = V_ERROR;
+            }
+            Mortrall::cycleCountThreshold = ccth;
+            Mortrall::_init();
+            Mortrall::initialized = true;
+            /* Report successful initialization */
+            _traceReport( V_DEBUG, "Mortrall initialized" EOL);
+        }
+
+        static void inline dumpElement(char element)
+        {
+            /* Input function for decoding instruction trace packets, must be called from outside */
+            if(Mortrall::initialized)
+            {
+                uint8_t byte = (uint8_t)element;
+                TRACEDecoderPump( &r->i, &byte, 1, _traceCB, r );
+            }
+        }
+        
+        void inline finalize(auto *process_tree)
+        {
+            if(Mortrall::initialized)
+            {
+                /* Commit all changes */
+                Mortrall::r->committed = true;
+                /* flush remaining buffer entries */
+                Mortrall::_flush_proto_buffer();
+            }
+            /* Initialize the protobuf categories */
+            Mortrall::_init_protobuf(process_tree);
+            /* Print Debug Information */
+            struct TRACECPUState *cpu = TRACECPUState( &Mortrall::r->i );
+            /* The number of overflows is one metrics that represents the quality of the trace */
+            /* If there are many overflows try implicit tracing */
+            printf("Overflows: %llu - %llu\n",cpu->overflows,cpu->ASyncs);
+            if (cpu->overflows > 0)
+            {
+                printf("Warning: When tracing implicit, it is very likely overflows corrupt the Call Stack.\n");
+            }
+            printf("PC bitmap cardinality = %llu\n", r1.cardinality());
+            // Serialize the bitmap to a file
+            // Step 1: Get the size of the serialized bitmap
+            size_t size_in_bytes = r1.getSizeInBytes();
+
+            // Step 2: Create a buffer (vector of char) to hold the serialized bitmap
+            std::vector<char> buffer(size_in_bytes);
+
+            // Step 3: Serialize the bitmap into the buffer
+            r1.write(buffer.data());
+
+            // Step 4: Write the buffer to a binary file
+            std::ofstream out("bitmap.roar", std::ios::binary);
+            out.write(buffer.data(), buffer.size());
+            out.close();
+            printf("PC bitmap serialized to 'bitmap.roar'\n");
+            delete Mortrall::r;
+        }
+
+        void inline add_thread_switch(uint16_t tid)
+        {
+            /* Callback for itm decoding */
+            Mortrall::thread_switches.push_back(tid);
+            thread_switches_size++;
+        }
+
+    private:
+
+//--------------------------------------------------------------------------------------//
+//-------------------------------- BEGIN REGION Callback -------------------------------//
+//--------------------------------------------------------------------------------------//
+        // global variables for callStack handling in callback
+        static inline bool revertStack = false;
+        static inline bool inconsistent = false;
+        // Callback function if an "interesting" element has been received
+        static void inline _traceCB( void *d )
+        /* Callback function for when valid TRACE decode is detected */
+        {
+            Mortrall::r = ( RunTime * )d;
+            struct TRACECPUState *cpu = TRACECPUState( &Mortrall::r->i );
+            uint32_t incAddr = 0;
+            uint32_t disposition;
+            uint32_t targetAddr = 0; /* Just to avoid unitialised variable warning */
+            bool linearRun = false;
+            enum instructionClass ic;
+            symbolMemaddr newaddr;
+
+
+            /* Check for Cycle Count update to reset instruction count*/
+            if (TRACEStateChanged( &Mortrall::r->i, EV_CH_CYCLECOUNT) )
+            {
+                // Mortrall::_generate_protobuf_cycle_counts();
+                Mortrall::_flush_proto_buffer();
+                Mortrall::r->instruction_count = 0;
+                Mortrall::update_itm_timestamp(cpu->cycleCount);
+                _traceReport( V_DEBUG, "Cc: %lu\n",cpu->cycleCount );
+            }
+
+
+            /* 2: Deal with exception entry */
+            /* ============================ */
+            if ( TRACEStateChanged( &Mortrall::r->i, EV_CH_EX_ENTRY ) )
+            {
+                switch ( Mortrall::r->protocol )
+                {
+                    case TRACE_PROT_ETM4:
+
+                        /* For the ETM4 case we get a new address with the exception indication. This address is the preferred _return_ address, */
+                        /* there will be a further address packet, which is the jump destination, along shortly. Note that _this_ address        */
+                        /* change indication will be consumed here, and won't hit the test below (which is correct behaviour.                    */
+                        if ( !TRACEStateChanged( &Mortrall::r->i, EV_CH_ADDRESS ) )
+                        {
+                            _traceReport( V_DEBUG, "Exception occured without return address specification" );
+                        }
+                        else
+                        {
+                            /* Sometimes an invalid exception address is transmitted. When this happens do not use it as preferred return address. */
+                            /* It seems like this can happen when there just has been a branch instruction before the exception and it is not clear if the jump will be executed*/
+                            /* An invalid address starts has hex: 0xf...*/
+                            /* Later I found out this a specific bug for px4v5x with too high clock frequency which causes packets to be corrupted. */
+                            /* With a perfect setup the else statement should never be executed.*/
+                            if (cpu->addr < 0xf0000000)
+                            {
+                                _appendToOPBuffer( Mortrall::r, NULL, Mortrall::r->op.currentLine, LT_EVENT, "========== Exception Entry (%d (%s) at 0x%08x return to 0x%08x ) ==========",
+                                                cpu->exception, TRACEExceptionName( cpu->exception ), Mortrall::r->op.workingAddr, cpu->addr );
+                                Mortrall::r->returnAddress = cpu->addr;
+                                revertStack = (cpu->addr != Mortrall::r->callStack->stack[Mortrall::r->callStack->stackDepth]);
+                            }
+                            else
+                            {
+                                _appendToOPBuffer( Mortrall::r, NULL, Mortrall::r->op.currentLine, LT_EVENT, "========== Exception Entry (%d (%s) at 0x%08x with invalid return address (0x%08x) ) ==========",
+                                                cpu->exception, TRACEExceptionName( cpu->exception ), Mortrall::r->op.workingAddr, cpu->addr );
+                                Mortrall::r->returnAddress = Mortrall::r->op.workingAddr;
+                            }
+                            Mortrall::r->exceptionEntry = true;
+                            Mortrall::r->exceptionId = cpu->exception;
+                        }
+                        break;
+
+                    default:
+                        _traceReport( V_DEBUG, "Unrecognised trace protocol in exception handler" );
+                        break;
+                }
+            }
+
+
+            /* 3: Collect flow affecting changes introduced by this event */
+            /* ========================================================== */
+            if ( TRACEStateChanged( &Mortrall::r->i, EV_CH_ADDRESS ) )
+            {
+                /* Make debug report if calculated and reported addresses differ. This is most useful for testing when exhaustive  */
+                /* address reporting is switched on. It will give 'false positives' for uncalculable instructions (e.g. bx lr) but */
+                /* it's a decent safety net to be sure the jump decoder is working correctly.                                      */
+                if ( Mortrall::r->protocol != TRACE_PROT_MTB )
+                {
+                    inconsistent = ( Mortrall::r->op.workingAddr != cpu->addr ) && !r->resentStackSwitch;
+                    _traceReport( V_DEBUG, "%sCommanded CPU Address change (Was:0x%08x Commanded:0x%08x)" EOL,
+                                ( !inconsistent ||  Mortrall::r->exceptionEntry) ? "" : "***INCONSISTENT*** ", Mortrall::r->op.workingAddr, cpu->addr );
+                    _revertStackDel(revertStack,inconsistent);
+                    _catchInconsistencies(inconsistent,cpu->addr);
+                    Mortrall::r->committed = true;              // This is needed to generate only protobuff entries when the jump was commited
+                    r->resentStackSwitch = false;               // Stack switch has been processed
+                    revertStack = false;                        // Reset revertStack
+                }
+                /* As every Exception Packet is followed by an Address Packet we handle the exception here to keep track of thread switches */
+                _handleExceptionEntry();
+                /*  After it is clear to what location the jump happened add the current function to the top of the stack and update in protobuf */
+                _addTopToStack(Mortrall::r,cpu->addr);
+                /* Update the ProtoBuf entries */
+                _generate_protobuf_entries_single(cpu->addr);
+                /* Report the Stack for debugging */
+                _stackReport(Mortrall::r);
+                /* Whatever the state was, this is an explicit setting of an address, so we need to respect it */
+                Mortrall::r->op.workingAddr = cpu->addr;        // Update working address from addr packet
+                Mortrall::r->exceptionEntry = false;            // Reset exception entry flag
+            }
+
+            if ( TRACEStateChanged( &Mortrall::r->i, EV_CH_LINEAR ) )
+            {
+                /* MTB-Specific mechanism: Execute instructions from the marked starting location to the indicated finishing one */
+                /* Disposition is all 1's because every instruction is executed.                                                 */
+                Mortrall::r->op.workingAddr = cpu->addr;
+                targetAddr        = cpu->toAddr;
+                linearRun         = true;
+                disposition       = 0xffffffff;
+                _traceReport( V_DEBUG, "Linear run 0x%08x to 0x%08x" EOL, cpu->addr, cpu->toAddr );
+            }
+
+            if ( TRACEStateChanged( &Mortrall::r->i, EV_CH_ENATOMS ) )
+            {
+                /* Atoms represent instruction steps...some of which will have been executed, some stepped over. The number of steps is the   */
+                /* total of the eatoms (executed) and natoms (not executed) and the disposition bitfield shows if each individual instruction */
+                /* was executed or not. For ETM3 each 'run' of instructions is a single instruction with the disposition bit telling you if   */
+                /* it was executed or not. For ETM4 each 'run' of instructions is from the current address to the next possible change of     */
+                /* program flow (and which point the disposition bit tells you if that jump was taken or not).                                */
+                incAddr = cpu->eatoms + cpu->natoms;
+                disposition = cpu->disposition;
+            }
+
+            /* 4: Execute the flow instructions */
+            /* ================================ */
+            while ( ( incAddr && !linearRun ) || ( ( Mortrall::r->op.workingAddr <= targetAddr ) && linearRun ) )
+            {
+                /* Firstly, lets get the source code line...*/
+                struct symbolLineStore *l = symbolLineAt( Mortrall::r->s, Mortrall::r->op.workingAddr );
+                struct symbolFunctionStore *func = symbolFunctionAt( Mortrall::r->s, Mortrall::r->op.workingAddr );
+
+                /* To be able to detect thread switches mortrall needs to be combined with itm tracing */
+                /* Check whether itm detected a thread switch */
+                _detect_thread_switch_pattern(func);
+
+                /* To always have the correct running/top function on the stack update after every instruction */
+                _addTopToStack(Mortrall::r,Mortrall::r->op.workingAddr);
+                /* Then print this update in perfetto */
+                _generate_protobuf_entries_single(Mortrall::r->op.workingAddr);
+
+                // New Atom Packets has been received therefore any stack changes are definitive
+                Mortrall::r->resentStackDel = false;        // Stack delete has been processed
+
+                if ( func )
+                {
+                    /* There is a valid function tag recognised here. If it's a change highlight it in the output. */
+                    if ( ( func->filename != Mortrall::r->op.currentFileindex ) || ( func != Mortrall::r->op.currentFunctionptr ) )
+                    {
+                        _appendToOPBuffer( Mortrall::r, l, Mortrall::r->op.currentLine, LT_FILE, "%s::%s", symbolGetFilename( Mortrall::r->s, func->filename ), func->funcname );
+                        Mortrall::r->op.currentFileindex     = func->filename;
+                        Mortrall::r->op.currentFunctionptr = func;
+                        Mortrall::r->op.currentLine = NO_LINE;
+                    }
+                }
+                else
+                {
+                    /* We didn't find a valid function, but we might have some information to work with.... */
+                    if ( ( NO_FILE != Mortrall::r->op.currentFileindex ) || ( NULL != Mortrall::r->op.currentFunctionptr ) )
+                    {
+                        _appendToOPBuffer( Mortrall::r, l, Mortrall::r->op.currentLine, LT_FILE, "Unknown function" );
+                        Mortrall::r->op.currentFileindex     = NO_FILE;
+                        Mortrall::r->op.currentFunctionptr = NULL;
+                        Mortrall::r->op.currentLine = NO_LINE;
+                    }
+                }
+
+                /* If we have changed line then output the new one */
+                if ( l && ( ( l->startline != Mortrall::r->op.currentLine ) ) )
+                {
+                    Mortrall::r->op.currentLine = l->startline;
+                }
+
+                /* This part caches the function symbolDisassembleLine with a lru-cache, which improves runtime heavily */
+                char *a = NULL;
+                if(cache_lru.exists(Mortrall::r->op.workingAddr))
+                {
+                    auto val = cache_lru.get(Mortrall::r->op.workingAddr);
+                    a = (char *)malloc(strlen(val.assembly) + 1);
+                    strcpy(a,val.assembly);
+                    ic = val.ic;
+                    newaddr = val.newaddr;
+                }
+                else
+                {
+                    /* Use capstone to disassemble current instruction and assign instructions types to ic */
+                    a = symbolDisassembleLine( Mortrall::r->s, &ic, Mortrall::r->op.workingAddr, &newaddr);
+                    if (a)
+                    {
+                        struct capstoneCache *cs_element = new capstoneCache();
+                        cs_element->assembly = (char *)malloc(strlen(a) + 1);
+                        strcpy(cs_element->assembly,a);
+                        cs_element->ic = ic;
+                        cs_element->newaddr = newaddr;
+                        cache_lru.put(Mortrall::r->op.workingAddr, *cs_element);
+                    }
+                }
+                
+                if ( a )
+                {
+                    _add_pc(Mortrall::r->op.workingAddr);
+                    /* Calculate if this instruction was executed. This is slightly hairy depending on which protocol we're using;         */
+                    /*   * ETM3.5: Instructions are executed based on disposition bit (LSB in disposition word)                            */
+                    /*   * ETM4  : ETM4 everything up to a branch is executed...decision about that branch is based on disposition bit     */
+                    /*   * MTB   : Everything except jumps are executed, jumps are executed only if they are the last instruction in a run */
+                    bool insExecuted = (
+                                                /* ETM3.5 case - dependent on disposition */
+                                                ( ( !linearRun )  && ( Mortrall::r->i.protocol == TRACE_PROT_ETM35 ) && ( disposition & 1 ) ) ||
+
+                                                /* ETM4 case - either not a branch or disposition is 1 */
+                                                ( ( !linearRun ) && ( Mortrall::r->i.protocol == TRACE_PROT_ETM4 ) && ( ( !( ic & LE_IC_JUMP ) ) || ( disposition & 1 ) ) ) ||
+
+                                                /* MTB case - a linear run to last address */
+                                                ( ( linearRun ) && ( Mortrall::r->i.protocol == TRACE_PROT_MTB ) &&
+                                                    ( ( ( Mortrall::r->op.workingAddr != targetAddr ) && ( ! ( ic & LE_IC_JUMP ) ) )  ||
+                                                    ( Mortrall::r->op.workingAddr == targetAddr )
+                                                    ) ) );
+                    _appendToOPBuffer( Mortrall::r, l, Mortrall::r->op.currentLine, insExecuted ? LT_ASSEMBLY : LT_NASSEMBLY, a );
+                    /* Count instructions fot later interpolating between cycle count packets*/
+                    if(insExecuted)
+                    {
+                        Mortrall::r->instruction_count++;
+                    }
+                    /* Move addressing along */
+                    if ( ( Mortrall::r->i.protocol != TRACE_PROT_ETM4 ) || ( ic & LE_IC_JUMP ) || (ic & LE_IC_SYNC_BARRIER) )
+                    {
+                        if ( Mortrall::r->i.protocol == TRACE_PROT_ETM4 )
+                        {
+                            _traceReport( V_DEBUG, "Consumed, %sexecuted (%d left)", insExecuted ? "" : "not ", incAddr - 1 );
+                        }
+
+                        disposition >>= 1;
+                        incAddr--;
+                    }
+
+                    if ( ic & LE_IC_CALL )
+                    {
+                        if ( insExecuted )
+                        {
+                            /* Push the instruction after this if it's a subroutine or ISR */
+                            _addRetToStack( Mortrall::r, Mortrall::r->op.workingAddr + ( ( ic & LE_IC_4BYTE ) ? 4 : 2 ));
+                            /* Add the called function on top of stack */
+                            _addTopToStack(Mortrall::r,newaddr);
+                            /* Print the callStack for debugging */
+                            _traceReport( V_DEBUG, "Call to %08x", newaddr );
+                            _stackReport(Mortrall::r);
+                        }
+
+                        Mortrall::r->op.workingAddr = insExecuted ? newaddr : Mortrall::r->op.workingAddr + ( ( ic & LE_IC_4BYTE ) ? 4 : 2 );
+                    }
+                    else if ( ic & LE_IC_JUMP )
+                    {
+                        _traceReport( V_DEBUG, "%sTAKEN JUMP", insExecuted ? "" : "NOT " );
+
+                        if ( insExecuted )
+                        {
+                            /* Update working address according to if jump was taken */
+                            if ( ic & LE_IC_IMMEDIATE )
+                            {
+                                /* This is a jump, so we need to use the target address */
+                                _traceReport( V_DEBUG, "Immediate address %8x", newaddr );
+                                Mortrall::r->op.workingAddr = newaddr;
+                            }
+                            else
+                            {
+                                if(!_handleExceptionExit(func))
+                                {
+                                    /* We didn't get the address, so need to park the call stack address if we've got one. Either we won't      */
+                                    /* get an address (in which case this one was correct), or we wont (in which case, don't unstack this one). */
+                                    if ( Mortrall::r->callStack->stackDepth )
+                                    {
+                                        Mortrall::r->op.workingAddr= Mortrall::r->callStack->stack[Mortrall::r->callStack->stackDepth - 1];
+                                        _traceReport( V_DEBUG, "Return with stacked candidate to %08x", Mortrall::r->op.workingAddr );
+                                    }
+                                    else
+                                    {
+                                        _traceReport( V_DEBUG, "Return with no stacked candidate" );
+                                    }
+                                    Mortrall::r->committed = false;
+                                    Mortrall::r->resentStackDel = true;
+                                    _removeRetFromStack(Mortrall::r);
+                                }
+                            }
+                            _stackReport(Mortrall::r);
+                        }
+                        else
+                        {
+                            /* The branch wasn't taken, so just move along */
+                            Mortrall::r->op.workingAddr += ( ic & LE_IC_4BYTE ) ? 4 : 2;
+                        }
+                    }
+                    else if (ic & LE_IC_SYNC_BARRIER)
+                    {
+                        /* This is a sync barrier, it has its own Atom packet like a jump instruction */
+                        _traceReport( V_DEBUG, "Sync Barrier. \n");
+                        Mortrall::r->op.workingAddr += ( ic & LE_IC_4BYTE ) ? 4 : 2;
+                    }
+                    else
+                    {
+                        /* Just a regular instruction, so just move along */
+                        Mortrall::r->op.workingAddr += ( ic & LE_IC_4BYTE ) ? 4 : 2;
+                    }
+                }
+                else
+                {
+                    /* If it is the first time assembly is not found switch elf file because we exceeded the address range of bootloader elf*/
+                    /* Check if trace is still executing bootloader */
+                    if (r->bootloader)
+                    {
+                        /* Switch to default elf file if the new address is available */
+                        if (symbolDisassembleLine( r->_s, &ic, Mortrall::r->op.workingAddr, &newaddr))
+                        {
+                            /* switch symbols (elf File) */
+                            r->s = r->_s;
+                            /* Start with main_thread by default */
+                            Mortrall::tid = 0;
+                            /* Set the callStack to the main_thread */
+                            r->callStack = &callstacks[Mortrall::tid];
+                            Mortrall::activeCallStackThread = PID_CALLSTACK + Mortrall::tid;
+                            /* Add the first function to the top of the stack */
+                            _addTopToStack(Mortrall::r,cpu->addr);
+                            /* Update the ProtoBuf entries */
+                            _generate_protobuf_entries_single(cpu->addr);
+                            /* reset bootloader flag */
+                            r->bootloader = false;
+                            /* switch symbols for itm trace as well */
+                            Mortrall::switch_itm_symbols();
+                            /* Print Debug Information */
+                            _traceReport( V_DEBUG, "*** BOOTLOADER FINISHED *** \n");
+                        }
+                    }else
+                    {
+                        _appendToOPBuffer( Mortrall::r, l, Mortrall::r->op.currentLine, LT_ASSEMBLY, "%8x:\tASSEMBLY NOT FOUND" EOL, Mortrall::r->op.workingAddr );
+                        Mortrall::r->op.workingAddr += 2;
+                        disposition >>= 1;
+                        incAddr--;
+                    }
+                }
+            }
+        }
+
+//--------------------------------------------------------------------------------------//
+//--------------------------------- END REGION Callback --------------------------------//
+//--------------------------------------------------------------------------------------//
+
+//--------------------------------------------------------------------------------------//
+//---------------------------------- BEGIN REGION Init ---------------------------------//
+//--------------------------------------------------------------------------------------//
+
+        static void inline _init()
+        {
+            /* Init ETM4 Decoder */
+            TRACEprotocol trp = TRACE_PROT_ETM4;
+            Mortrall::Mortrall::r->protocol = trp;
+            TRACEDecoderInit( &Mortrall::Mortrall::r->i, trp, true, _traceReport );
+            /* Init Debug counters */
+            r->i.cpu.ASyncs = 0;
+            r->i.cpu.overflows = 0;
+        }
+        static void inline _init_protobuf(auto *process_tree)
+        {
+            {
+                auto *process = process_tree->add_processes();
+                process->set_pid(PID_CALLSTACK);
+                process->add_cmdline("CallStack");
+                // loop over callstacks
+                for (auto const& [key, val] : Mortrall::callstacks)
+                {
+                    auto *thread = process_tree->add_threads();
+                    thread->set_tid(PID_CALLSTACK + key);
+                    thread->set_tgid(PID_CALLSTACK);
+                    thread->set_name("Thread");
+                }
+                // Add additional thread for bootloader
+                auto *thread = process_tree->add_threads();
+                thread->set_tid(PID_BOOTLOADER);
+                thread->set_tgid(PID_CALLSTACK);
+                thread->set_name("Bootloader");
+            }
+            {
+                auto *process = process_tree->add_processes();
+                process->set_pid(PID_EXCEPTION);
+                process->add_cmdline("EXCEPTIONS");
+                for (auto&& [tid, name] : exception_names)
+                {
+                    auto *thread = process_tree->add_threads();
+                    thread->set_tid(PID_EXCEPTION + tid);
+                    thread->set_tgid(PID_EXCEPTION);
+                    thread->set_name(name);
+                }
+            }
+            {
+                auto *process = process_tree->add_processes();
+                process->set_pid(PID_PC);
+                process->add_cmdline("PC");
+                auto *thread = process_tree->add_threads();
+                thread->set_tid(PID_PC);
+                thread->set_tgid(PID_PC);
+                thread->set_name("PC");
+            }
+        }
+
+//--------------------------------------------------------------------------------------//
+//----------------------------------- END REGION Init ----------------------------------//
+//--------------------------------------------------------------------------------------//
+
+//--------------------------------------------------------------------------------------//
+//-------------------------------- BEGIN REGION Perfetto -------------------------------//
+//--------------------------------------------------------------------------------------//
+
+        static void inline _add_pc(uint32_t addr)
+        {
+            r1.add(addr);
+        }
+
+        static void inline _appendTOProtoBuffer(auto *event, int offset = 0)
+        {
+            /* As the instruction count interpolation cannot be applied before the next cycle count is received store the event in a buffer */
+            csb.proto_buffer[csb.proto_buffer_index] = event;
+            csb.instruction_counts[csb.proto_buffer_index] = Mortrall::r->instruction_count + offset;
+            if (Mortrall::r->i.cpu.cycleCount == COUNT_UNKNOWN && Mortrall::r->callStack->perfettoStackDepth == -1){
+                /* The first packet sometimes still has an unknown cycle count however it is actually 0 */
+                csb.global_interpolations[csb.proto_buffer_index] = 0;
+            }else
+            {
+                csb.global_interpolations[csb.proto_buffer_index] = Mortrall::r->i.cpu.cycleCount;
+            }
+            /* Increment Index */
+            csb.proto_buffer_index++;
+            /* Update perfetto stack depth by a maximum of 1 */
+            if (Mortrall::r->callStack->stackDepth < Mortrall::r->callStack->perfettoStackDepth){
+                Mortrall::r->callStack->perfettoStackDepth--;
+            }else if (Mortrall::r->callStack->stackDepth > Mortrall::r->callStack->perfettoStackDepth){
+                Mortrall::r->callStack->perfettoStackDepth++;
+            }
+            /* Check for buffer overflow */
+            if (csb.proto_buffer_index == MAX_BUFFER_SIZE)
+            {
+                Mortrall::_flush_proto_buffer();
+            }
+        }
+
+        static void inline _handleInconsistentFunctionSwitch(struct symbolFunctionStore *next_func, uint32_t addr)
+        {
+            /*
+                This function inserts a stop and start element into the protobuf trace to compensate for function switches
+                within the same level of the call stack. This should not happen with a perfect instruction trace but seems 
+                unavoidable with the current trace data.
+            */
+            if(Mortrall::r->callStack->stackDepth >= 0)
+            {
+                /* First end current function and append to proto buffer */ 
+                auto *event = ftrace->add_event();      // create Ftrace event
+                auto *print = event->mutable_print();
+                char buffer[80];
+                event->set_pid(Mortrall::activeCallStackThread);        // set the pid of the event
+                snprintf(buffer, sizeof(buffer), "E|0");
+                print->set_buf(buffer);
+                _appendTOProtoBuffer(event);
+
+                /* Second begin current function and append to proto buffer */
+                event = ftrace->add_event();        // create Ftrace event
+                print = event->mutable_print();         // add print
+                event->set_pid(Mortrall::activeCallStackThread);        // set the pid of the event
+                snprintf(buffer, sizeof(buffer), "B|0|%s", next_func->funcname);
+                print->set_buf(buffer);
+                _appendTOProtoBuffer(event,1);      // Use an offset of 1 to compensate for the missing instruction count
+            }
+        }
+
+        static bool inline _inconsistentFunctionSwitch(uint32_t addr)
+        {
+            /* Check whether a function switch on the same stack depth happend */
+            if((int)Mortrall::r->callStack->stackDepth == Mortrall::r->callStack->perfettoStackDepth){
+                struct symbolFunctionStore *next_func = symbolFunctionAt( Mortrall::r->s, addr );
+                if(top_thread_func && next_func && strcmp(next_func->funcname, top_thread_func->funcname))
+                {
+                        _traceReport( V_DEBUG, "Inconsistent function switch detected between functions: %s and %s\n", next_func->funcname, top_thread_func->funcname);
+                        /* Switch the functions in perfetto trace */
+                        _handleInconsistentFunctionSwitch(next_func,addr);
+                        /* Update top thread function after handling the inconsistency*/
+                        Mortrall::top_thread_func = symbolFunctionAt( Mortrall::r->s, Mortrall::r->callStack->stack[Mortrall::r->callStack->stackDepth] );
+                }
+                return true;
+            }
+            return false;
+        }
+
+        static void inline _generate_protobuf_entries_single(uint32_t addr)
+        {
+            /* Check whether the stack hight changed and if yes if the changes are commited*/
+            if(!_inconsistentFunctionSwitch(addr) &&  Mortrall::r->committed)
+            {
+                /* Create Ftrace event */
+                auto *event = ftrace->add_event();
+                auto *print = event->mutable_print();
+                char buffer[80];
+                /* Get the function at the current address */
+                Mortrall::top_thread_func = symbolFunctionAt( Mortrall::r->s, Mortrall::r->callStack->stack[Mortrall::r->callStack->stackDepth] );
+                /* Check if the call stack reduced or increased */
+                if(((int)Mortrall::r->callStack->stackDepth > Mortrall::r->callStack->perfettoStackDepth))
+                {
+                    /* set the pid of the event */
+                    event->set_pid(Mortrall::activeCallStackThread);
+                    if (top_thread_func)
+                    {
+                        snprintf(buffer, sizeof(buffer), "B|0|%s", Mortrall::top_thread_func->funcname);
+                    }else
+                    {
+                        snprintf(buffer, sizeof(buffer), "B|0|0x%08x", Mortrall::r->op.workingAddr);
+                    }
+                }
+                else if(((int)Mortrall::r->callStack->stackDepth < Mortrall::r->callStack->perfettoStackDepth))
+                {
+                    /* set the pid of the event */
+                    event->set_pid(Mortrall::activeCallStackThread);
+                    snprintf(buffer, sizeof(buffer), "E|0");
+                }
+                print->set_buf(buffer);
+                /* append to buffer to set the timestamp on the next cycle count packet */
+                _appendTOProtoBuffer(event);
+            }
+        }
+
+        static void inline _generate_protobuf_cycle_counts()
+        {
+            /* Create Ftrace print event for cycle count packets */
+            /* This is mainly for debugging and not needed for correct displaying of the callstack */
+            auto *event = ftrace->add_event();
+            uint64_t ns = (uint64_t)(((Mortrall::r->i.cpu.cycleCount * 1'000'000'000)/ Mortrall::cps)-1);
+            event->set_timestamp(ns);
+            event->set_pid(activeCallStackThread);
+            auto *print = event->mutable_print();
+            char buffer[40];
+            snprintf(buffer, sizeof(buffer), "I|0|CC: %llu",Mortrall::r->i.cpu.cycleCount);
+            print->set_buf(buffer);
+        }
+
+        static double inline _get_ic_percentage(int i)
+        {
+            /* Compute the relative times between two cycle count packets depending on instruction count*/
+            uint16_t ic = csb.instruction_counts[i];
+            double ret = 0;
+            if (ic != 0)
+            {
+                    ret = ((double)ic/(double)Mortrall::r->instruction_count) * (Mortrall::r->i.cpu.cycleCount - csb.lastCycleCount);
+            }
+            return ret;
+        }
+
+        static inline uint64_t perf_prev_ns = 0;
+        static void inline _flush_proto_buffer()
+        {
+            /* Clear buffer and place all trace packets on their relative timestamp */
+            for (int i = 0; i < csb.proto_buffer_index; i++)
+            {
+                auto *event = csb.proto_buffer[i];
+                uint64_t interpolation = csb.global_interpolations[i] + (uint64_t)_get_ic_percentage(i);
+                uint64_t ns = (uint64_t)((interpolation * 1'000'000'000) / Mortrall::cps);
+                /* There cannot be two events at the same timestamp, therefore check if ns is smaller than previous ns */
+                /* Note: this should not be necessary with a perfect instruction trace however */
+                if(perf_prev_ns >= ns){
+                    ns = perf_prev_ns + 1;
+                }
+                perf_prev_ns = ns;
+                event->set_timestamp(ns);
+            }
+            /* Reset index after flushing */
+            csb.proto_buffer_index = 0;
+            csb.lastCycleCount = Mortrall::r->i.cpu.cycleCount;
+        }
+
+//--------------------------------------------------------------------------------------//
+//--------------------------------- END REGION Perfetto --------------------------------//
+//--------------------------------------------------------------------------------------//
+
+//--------------------------------------------------------------------------------------//
+//-------------------------------- BEGIN REGION CallStack ------------------------------//
+//--------------------------------------------------------------------------------------//
+
+        static void inline _handleExceptionEntry()
+        {
+            /* Handle the context switch after a exception */
+            if (Mortrall::r->exceptionEntry)
+            {
+                Mortrall::_generate_protobuf_cycle_counts();
+                /* Check if exception Id is in the map */
+                if(!Mortrall::exception_names.contains(Mortrall::r->exceptionId))
+                {
+                    Mortrall::exception_names[Mortrall::r->exceptionId] = TRACEExceptionName(Mortrall::r->exceptionId);
+                }
+                /* Flush remaining perfetto events before switching context */
+                _generate_protobuf_entries_single(Mortrall::r->op.workingAddr);
+                _flush_proto_buffer();
+                _traceReport( V_DEBUG, "*** THREAD SWITCH *** (to exception: %u)" , Mortrall::r->exceptionId);
+                /* Switch current call stack to exception call stack */
+                Mortrall::r->callStack = &r->exceptionCallStack;
+                Mortrall::activeCallStackThread = Mortrall::PID_EXCEPTION + Mortrall::r->exceptionId;
+                /* Set exception flag */
+                r->exceptionActive = true;
+            }
+        }
+
+        static bool inline _handleExceptionExit(struct symbolFunctionStore *func)
+        {
+            /* As there is no exception exit packet the exit has to be detected by ending of the "arm_exception" function */
+            /* If highaddr is reached its the end of exception (highaddr might be offset by a 1/2 byte) */
+            if(r->exceptionActive && func && strstr(func->funcname,"arm_exception") && (Mortrall::r->op.workingAddr >= (func->highaddr - 0x1f))&& (Mortrall::r->op.workingAddr <= (func->highaddr + 0xf)))
+            {
+                Mortrall::_generate_protobuf_cycle_counts();
+                /* Clear the exception call stack */
+                _removeRetFromStack(Mortrall::r);
+                _generate_protobuf_entries_single(Mortrall::r->op.workingAddr);
+                _flush_proto_buffer();
+                _stackReport(Mortrall::r);
+                /* Check if the current elf file is still the bootloader */
+                if(r->bootloader){
+                    /* Switch to bootloader */
+                    _traceReport( V_DEBUG, "*** THREAD SWITCH *** (to tid: %u)" , Mortrall::PID_BOOTLOADER);
+                    /* set the current call stack in runtime */
+                    Mortrall::r->callStack = &r->bootloaderCallStack;
+                    Mortrall::activeCallStackThread = Mortrall::PID_BOOTLOADER;
+                    /* Reset pending thread switch flag */
+                    Mortrall::pending_thread_switch = false;
+                    /* Set stack switch flag */
+                    r->resentStackSwitch = true;
+                    /* Reset exception flag */
+                    r->exceptionActive = false;
+                    /* Debug print Callstack */
+                    _stackReport(Mortrall::r);
+                    return true;
+                }
+                /* Switch to new call stack */
+                /* Check if we switched threads or are just returning from the exception to the old callStack */
+                if (Mortrall::pending_thread_switch){
+                    Mortrall::tid = Mortrall::pending_tid;
+                }
+                _traceReport( V_DEBUG, "*** THREAD SWITCH *** (to tid: %u)" , Mortrall::tid);
+                /* set the current call stack in runtime */
+                Mortrall::r->callStack = &Mortrall::callstacks[Mortrall::tid];
+                Mortrall::activeCallStackThread = Mortrall::PID_CALLSTACK + Mortrall::tid;
+                /* Reset pending thread switch flag */
+                Mortrall::pending_thread_switch = false;
+                /* Set stack switch flag */
+                r->resentStackSwitch = true;
+                /* Reset exception flag */
+                r->exceptionActive = false;
+                /* Debug print Callstack */
+                _stackReport(Mortrall::r);
+                return true;
+            }
+            return false;
+        }
+
+
+        static void inline _revertStackDel(bool revertStack, bool inconsistent)
+        {
+            /*
+                There are multiple reasons why we would want to revert a stack deletion:
+                1. There has been a jump instruction without immediate address this means we have taken a jump but are not sure where we jumped until the next addr packet
+                2. There has been a exception right after a jump instruction this means it is not clear if the jump will be executed or the exception interrupted
+            */
+            if ( Mortrall::r->resentStackDel && (revertStack || (inconsistent && !Mortrall::r->exceptionEntry)))
+            {
+                _traceReport( V_DEBUG, "Stack delete reverted" );
+                if ( Mortrall::r->callStack->stackDepth < MAX_CALL_STACK - 1 )
+                {
+                    Mortrall::r->callStack->stackDepth++;
+                }
+            }
+        }
+
+        static void inline _catchInconsistencies(bool inconsistent, uint32_t addr)
+        {
+            /* Check if we the took the right branch after receiving an address packet.*/
+            /* This should not be necessary with a perfect trace and should ideally be removed */
+            if (inconsistent)
+            {
+                /* The return can be removed if and explicit trace has been recorded. (needs to be set in gdb.init)*/
+                /* If removed makes it can make the decoding more robust but might create some inconsistencies and jumps in the callStack */
+                return;
+            }
+            if(Mortrall::r->callStack->stackDepth > 0)
+            {
+                /* Loop over the callstack stack and check for duplicate entries */
+                struct symbolFunctionStore *new_func = symbolFunctionAt( Mortrall::r->s, addr);
+                for (int i = (Mortrall::r->callStack->stackDepth - 1); i >= 0; i--)
+                {
+                    /* Check if function name address is the same as the new address */
+                    struct symbolFunctionStore *current_func = symbolFunctionAt( Mortrall::r->s, Mortrall::r->callStack->stack[i]);
+                    if(current_func != NULL && new_func != NULL && strcmp(current_func->funcname,new_func->funcname) == 0)
+                    {
+                        /* Precise Debug prints to be able to reconstruct the code logic */
+                        _traceReport( V_DEBUG, "Inconsistency has been caught and reverted at stack depth: %i", i);
+                        _traceReport( V_DEBUG, "New Func: %s [%08x]" , new_func->funcname, addr);
+                        _traceReport( V_DEBUG, "Current Func: %s [%08x]" , current_func->funcname, Mortrall::r->callStack->stack[i]);
+                        for (int j = (Mortrall::r->callStack->stackDepth - 1); j >= i; j--)
+                        {
+                            /* Collapse stack until the duplicate function is reached*/
+                            _removeRetFromStack(Mortrall::r);
+                            /* Commit the changes */
+                            Mortrall::r->committed = true;
+                            /* Update protobuf trace*/
+                            _generate_protobuf_entries_single(addr);
+                        }
+                        _stackReport(Mortrall::r);
+                    }
+                }
+            }
+        }
+
+        static void inline displayProgressBar(float progress) {
+            /* Basic c++ progress bar implementation */
+            std::cout << "[";
+            int pos = 50 * progress;
+            for (int i = 0; i < 50; ++i) {
+                if (i < pos) std::cout << "=";
+                else if (i == pos) std::cout << ">";
+                else std::cout << " ";
+            }
+            std::cout << "] " << std::fixed << std::setprecision(2) << (progress * 100.0) << " %\r";
+            std::cout.flush();
+        }
+
+        static void inline _detect_thread_switch_pattern(struct symbolFunctionStore *func)
+        {
+            /* Detect if NuttX is about to switch context */
+            /* Hint: This would need to be changed for work for other RTOS */
+            if(!Mortrall::pending_thread_switch && func && (strcmp(func->funcname,"sched_note_resume") == 0))
+            {
+                int remaining_switches = thread_switches.size();
+                if (remaining_switches == 0)
+                {
+                    /* If there are no more thread switches left return. */
+                    /* If this happens decoding was not correct */
+                    _traceReport( V_DEBUG, "No more thread switches." );
+                    return;
+                }
+                /* Switch to next thread id computed by prePumping the trace file */
+                Mortrall::pending_tid = Mortrall::thread_switches.front();
+                Mortrall::thread_switches.pop_front();
+                /* Set pending thread switch flag */
+                Mortrall::pending_thread_switch = true;
+                /* Debug print thread switch */
+                _traceReport( V_DEBUG, "Thread switch pattern detected with pending tid: %u" , Mortrall::pending_tid);
+                /* Display the progress bar depending on how many context switches are left */
+                displayProgressBar((float)(thread_switches_size-thread_switches.size()) / (float)thread_switches_size);
+            }
+            /* Set commited flag */
+            Mortrall::r->committed = true;
+        }
+
+        static void inline _addRetToStack( RunTime *r, symbolMemaddr p , int num = 0)
+        {
+            /* Check if Stack is full */
+            if ( Mortrall::r->callStack->stackDepth == MAX_CALL_STACK - 1 )
+            {
+                /* Make room for a new entry */
+                memmove( &Mortrall::r->callStack->stack[0], &Mortrall::r->callStack->stack[1], sizeof( symbolMemaddr ) * ( MAX_CALL_STACK - 1 ) );
+            }
+
+            /* Add address to stack */
+            Mortrall::r->callStack->stack[Mortrall::r->callStack->stackDepth] = p;
+            /* Debug print */
+            _traceReport( V_DEBUG, "Pushed %08x to return stack", Mortrall::r->callStack->stack[Mortrall::r->callStack->stackDepth]);
+            /* Increment stack depth */
+            if ( Mortrall::r->callStack->stackDepth < MAX_CALL_STACK - 1 )
+            {
+                Mortrall::r->callStack->stackDepth++;
+            }
+        }
+        static void inline _removeRetFromStack(RunTime *r)
+        {
+            /* Decrease callStack */
+            /* If Stack depth is '-1' the stack is uninitialized */ 
+            /* This needs to be the case for every exception callStack after it is exited */
+            if ( Mortrall::r->callStack->stackDepth >= 0 )
+            {
+                Mortrall::r->callStack->stackDepth--;
+                _traceReport( V_DEBUG, "Popped %08x from return stack", Mortrall::r->callStack->stack[Mortrall::r->callStack->stackDepth]);
+            }
+        }
+        static void inline _addTopToStack(RunTime *r,symbolMemaddr p)
+        {
+            /* If the stack is uninitialized set the stack depth to 0 */
+            if ( Mortrall::r->callStack->stackDepth == -1)
+            {
+                Mortrall::r->callStack->stackDepth = 0;
+            }
+            /* Just update the top address without changing the stack depth*/
+            if ( Mortrall::r->callStack->stackDepth < MAX_CALL_STACK - 1 )
+            {
+                Mortrall::r->callStack->stack[Mortrall::r->callStack->stackDepth] = p;
+            }
+        }
+
+//--------------------------------------------------------------------------------------//
+//--------------------------------- END REGION CallStack -------------------------------//
+//--------------------------------------------------------------------------------------//
+
+
+//--------------------------------------------------------------------------------------//
+//-------------------------------- BEGIN REGION Report ---------------------------------//
+//--------------------------------------------------------------------------------------//
+        static inline bool debug_flag = false;
+        static void inline _appendToOPBuffer( struct RunTime *r, void *dat, int32_t lineno, enum LineType lt, const char *fmt, ... )
+        /* Add line to output buffer, in a printf stylee */
+        {
+            /* Only append if verbose is actually wanted to save execution time */
+            if ( Mortrall::verbose != V_DEBUG )
+            {
+                return;
+            }
+            char construct[SCRATCH_STRING_LEN];
+            va_list va;
+            char *p;
+
+            va_start( va , fmt );
+            vsnprintf( construct, SCRATCH_STRING_LEN, fmt, va );
+            va_end( va );
+
+            /* Make sure we didn't accidentially admit a CR or LF */
+            for ( p = construct; ( ( *p ) && ( *p != '\n' ) && ( *p != '\r' ) ); p++ );
+
+            *p = 0;
+            
+            /* Relate the reporting to cycle count which can be set by an argument to get debug information for certain parts of the trace */
+            if(Mortrall::r->i.cpu.cycleCount >= cycleCountThreshold)
+            {
+                genericsReport( V_DEBUG, "%s" EOL, construct );
+            }
+            if(Mortrall::r->i.cpu.cycleCount == cycleCountThreshold && !debug_flag)
+            {
+                debug_flag = true;
+                genericsReport( V_INFO, "Debug Flag set." EOL );
+            }
+        }
+        static void inline _traceReport( enum verbLevel l, const char *fmt, ... )
+        /* Debug reporting stream */
+        {
+            /* Only append if verbose is actually wanted to save execution time */
+            if ( Mortrall::verbose != V_DEBUG )
+            {
+                return;
+            }
+            static char op[SCRATCH_STRING_LEN];
+            va_list va;
+            va_start( va, fmt );
+            vsnprintf( op, SCRATCH_STRING_LEN, fmt, va );
+            va_end( va );
+
+            /* Relate the reporting to cycle count which can be set by an argument to get debug information for certain parts of the trace */
+            if(Mortrall::r->i.cpu.cycleCount >= cycleCountThreshold)
+            {
+                genericsReport( V_DEBUG, "%s" EOL, op );
+            }
+        }
+        static void inline _stackReport(RunTime *r)
+        {
+            /* Only append if verbose is actually wanted to save execution time */
+            if ( Mortrall::verbose != V_DEBUG )
+            {
+                return;
+            }
+            /* Generate String which has a readable representation of the callStack */
+            char CallStack[STACK_BUFFER_SIZE] = "";
+            if ( Mortrall::r->callStack->stackDepth == 0 )
+            {
+                Mortrall::strfcat(CallStack, "Stack %d is empty" EOL, Mortrall::tid);
+                if(Mortrall::r->callStack->stack[Mortrall::r->callStack->stackDepth])
+                {
+                    Mortrall::strfcat(CallStack, "Stack %d: %08x" EOL, Mortrall::r->callStack->stackDepth, Mortrall::r->callStack->stack[Mortrall::r->callStack->stackDepth]);
+                }
+                strcat(CallStack, EOL);
+            }
+            else
+            {
+                Mortrall::strfcat(CallStack,  "Stack depth is %d with tid: %d" EOL, Mortrall::r->callStack->stackDepth, Mortrall::tid);
+                for ( int i = 0; i < Mortrall::r->callStack->stackDepth+1; i++ )
+                {
+                    struct symbolFunctionStore *running_func = symbolFunctionAt( Mortrall::r->s, Mortrall::r->callStack->stack[i] );
+                    if(running_func)
+                    {
+                        Mortrall::strfcat(CallStack,  "Stack %d: %08x %s" EOL, i, Mortrall::r->callStack->stack[i], running_func->funcname);
+                    }
+                    else
+                    {
+                        Mortrall::strfcat(CallStack,  "Stack %d: %08x" EOL, i, Mortrall::r->callStack->stack[i]);
+                    }
+                }
+                strcat(CallStack, EOL);
+            }
+            /* Relate the reporting to cycle count which can be set by an argument to get debug information for certain parts of the trace */
+            if(Mortrall::r->i.cpu.cycleCount >= cycleCountThreshold)
+            {
+                _traceReport( V_DEBUG, CallStack);
+            }
+        }
+        static void inline strfcat(char *str, const char *format, ...)
+        {
+            char buffer[STACK_BUFFER_SIZE];
+            va_list args;
+            va_start(args, format);
+            vsnprintf(buffer,STACK_BUFFER_SIZE, format, args);
+            va_end(args);
+            strcat(str, buffer);
+        }
+
+//--------------------------------------------------------------------------------------//
+//---------------------------------- END REGION Report ---------------------------------//
+//--------------------------------------------------------------------------------------//
+
+//--------------------------------------------------------------------------------------//
+//------------------------------------- BEGIN song -------------------------------------//
+//--------------------------------------------------------------------------------------//
+
+        /*  This part is just for fun because I was stuck on with instruction tracing and Niklas was on vacation */
+        /*  and sadly no one else at Auterion can help me with that (GPT: why should they, no one has ever done that )*/
+        static void inline _startSong()
+        {
+            /* Path to your audio file */
+            const char *songPath = "/Users/lukasvonbriel/Music/Music/Media.localized/Music/Unknown Artist/Unknown Album/glass-of-wine-143532.mp3";
+
+            /* Command to play the song using afplay */
+            char command[512];
+            snprintf(command, sizeof(command), "osascript -e 'tell application \"Terminal\" to do script \"afplay \\\"%s\\\"\"'", songPath);
+
+            /* Play the song */
+            system(command);
+        }
+
+        static void inline _endSong()
+        {
+            system("pkill afplay");
+        }
+
+//--------------------------------------------------------------------------------------//
+//------------------------------------- END song -------------------------------------//
+//--------------------------------------------------------------------------------------//
+
+};
\ No newline at end of file
diff --git a/ext/orbetto/src/orbetto.cpp b/ext/orbetto/src/orbetto.cpp
index 6a22e19..537a143 100644
--- a/ext/orbetto/src/orbetto.cpp
+++ b/ext/orbetto/src/orbetto.cpp
@@ -30,6 +30,7 @@ using namespace std::string_literals;
 #include "stream.h"
 #include "loadelf.h"
 #include "device.hpp"
+#include "mortrall.hpp"
 
 // To get the ITM channel list
 #include "../../src/emdbg/patch/data/itm.h"
@@ -45,9 +46,13 @@ struct
     bool useTPIU{false};
     uint32_t tpiuChannel{1};
     uint64_t cps{0};
+    uint64_t dbg_cc{0};
     std::string file;
     std::string elfFile;
+    std::string elfBootloaderFile;
     bool outputDebugFile;
+    enum verbLevel verbose;
+    bool etm{false};
 } options;
 
 struct
@@ -60,13 +65,17 @@ struct
     struct TPIUPacket p;
     uint64_t timeStamp;                  /* Latest received time */
     uint64_t ns;                         /* Latest received time in ns */
-    struct symbol *symbols;              /* symbols from the elf file */
+    struct symbol *symbols;              /* active symbols */
+    struct symbol *symbols_main;         /* symbols from the main elf file */
+    struct symbol *symbols_bootloader;   /* symbols from the bootloader elf file */
 } _r;
 
 static Device device;
 
 static perfetto::protos::Trace *perfetto_trace;
 static perfetto::protos::FtraceEventBundle *ftrace;
+Mortrall mortrall;
+
 // ====================================================================================================
 
 static constexpr uint32_t PID_TSK{0};
@@ -275,6 +284,16 @@ static void _handleSW( struct swMsg *m, struct ITMDecoder *i )
             const uint8_t priority = m->value >> 16;
             const uint8_t prev_state = m->value >> 24;
             _switchTo(tid, true, priority, prev_state);
+            if(prev_tid < PID_STOP && prev_tid != 0)
+            {
+                auto *event = ftrace->add_event();
+                event->set_timestamp(ns);
+                event->set_pid(PID_TSK + prev_tid);
+                auto *print = event->mutable_print();
+                char buffer[100];
+                snprintf(buffer, sizeof(buffer), "C|%u|Priorities %s|%u",PID_TSK,thread_names[prev_tid].c_str() ,priority);
+                print->set_buf(buffer);
+            }
             break;
         }
         case EMDBG_TASK_RUNNABLE: // ready
@@ -310,7 +329,7 @@ static void _handleSW( struct swMsg *m, struct ITMDecoder *i )
                 {
                     if (const char *name = (const char *) symbolCodeAt(_r.symbols, m->value, NULL); name)
                     {
-                        printf("Found Name %s for 0x%08x\n", name, m->value);
+                        //printf("Found Name %s for 0x%08x\n", name, m->value);
                         workqueue_names[m->value] = name;
                     }
                     else {
@@ -624,12 +643,28 @@ static void _handleSW( struct swMsg *m, struct ITMDecoder *i )
             }
             break;
         }
+        default:
+        {
+            // debug print the unknown source address
+            genericsReport( V_DEBUG, "Unknown SW message from %u\n", m->srcAddr );
+            break;
+        }
     }
 }
 // ====================================================================================================
 static void _handleTS( struct TSMsg *m, struct ITMDecoder *i )
 {
-    _r.timeStamp += m->timeInc;
+    // only use if instruction trace data is not available
+    if( !options.etm )
+    {
+        _r.timeStamp += m->timeInc;
+        _r.ns = (_r.timeStamp * 1e9) / options.cps;
+    }
+}
+
+static void _handleTSFromETM(uint64_t cc)
+{
+    _r.timeStamp = cc;
     _r.ns = (_r.timeStamp * 1e9) / options.cps;
 }
 
@@ -759,6 +794,11 @@ static void _itmPumpProcessPre( char c )
                     stopped_threads.insert(m->value);
                     // printf("Thread %u stopped\n", m->value);
                 }
+                // List all thread switches to construct call graph from instruction (etm) data later
+                if(m->srcAddr== EMDBG_TASK_RESUME)
+                {
+                    mortrall.add_thread_switch(m->value & 0xfffful);
+                }
             }
             else if (p.genericMsg.msgtype == MSG_PC_SAMPLE)
             {
@@ -806,6 +846,7 @@ static void _itmPumpProcess( char c )
 
         if ( h[pp->genericMsg.msgtype] )
         {
+
             ( h[pp->genericMsg.msgtype] )( pp, &_r.i );
         }
     }
@@ -817,11 +858,11 @@ static void _itmPumpProcess( char c )
 // ====================================================================================================
 // ====================================================================================================
 // ====================================================================================================
-static void _protocolPump( uint8_t c )
+static void _protocolPump( uint8_t c ,void ( *_pumpITMProcessGeneric )( char ),void ( *_pumpETMProcessGeneric )( char ))
 {
     if ( options.useTPIU )
     {
-        switch ( TPIUPump( &_r.t, c ) )
+        switch ( TPIUPump( &_r.t, c) )
         {
             case TPIU_EV_NEWSYNC:
             case TPIU_EV_SYNCED:
@@ -844,15 +885,23 @@ static void _protocolPump( uint8_t c )
 
                 for ( uint32_t g = 0; g < _r.p.len; g++ )
                 {
-                    if ( _r.p.packet[g].s == options.tpiuChannel )
+                    if  ( _r.p.packet[g].s == 2 )
                     {
-                        _itmPumpProcess( _r.p.packet[g].d );
+                        if ( _pumpETMProcessGeneric )
+                        {
+                            _pumpETMProcessGeneric( _r.p.packet[g].d );
+                        }else{
+                            options.etm = true;
+                        }
                         continue;
                     }
-
-                    if  ( _r.p.packet[g].s != 0 )
+                    else if ( _r.p.packet[g].s == 1 )
                     {
-                        genericsReport( V_DEBUG, "Unknown TPIU channel %02x" EOL, _r.p.packet[g].s );
+                        _pumpITMProcessGeneric( (char)_r.p.packet[g].d );
+                    }
+                    else
+                    {
+                        printf("Unknown TPIU channel %02x" EOL, _r.p.packet[g].s );
                     }
                 }
 
@@ -861,22 +910,34 @@ static void _protocolPump( uint8_t c )
             case TPIU_EV_ERROR:
                 genericsReport( V_WARN, "****ERROR****" EOL );
                 break;
+            default:
+                break;
         }
     }
     else
     {
-        _itmPumpProcess( c );
+        _pumpITMProcessGeneric( c );
     }
 }
+
+// ====================================================================================================
+
+static void _switchSymbols()
+{
+    _r.symbols = _r.symbols_main;
+}
+
 // ====================================================================================================
 static struct option _longOptions[] =
 {
+    {"cc", required_argument, NULL, 'a'},
     {"cpufreq", required_argument, NULL, 'C'},
     {"input-file", required_argument, NULL, 'f'},
     {"help", no_argument, NULL, 'h'},
     {"tpiu", required_argument, NULL, 't'},
     {"elf", required_argument, NULL, 'e'},
-    {"debug", no_argument, NULL, 'd'},
+    {"elf_bootloader", required_argument, NULL, 'b'},
+    {"debug", required_argument, NULL, 'd'},
     {"verbose", required_argument, NULL, 'v'},
     {"version", no_argument, NULL, 'V'},
     {NULL, no_argument, NULL, 0}
@@ -884,7 +945,7 @@ static struct option _longOptions[] =
 bool _processOptions( int argc, char *argv[] )
 {
     int c, optionIndex = 0;
-    while ( ( c = getopt_long ( argc, argv, "C:Ef:de:hVt:v:", _longOptions, &optionIndex ) ) != -1 )
+    while ( ( c = getopt_long ( argc, argv, "a:b:C:Ef:de:hVt:v:", _longOptions, &optionIndex ) ) != -1 )
         switch ( c )
         {
             // ------------------------------------
@@ -895,14 +956,15 @@ bool _processOptions( int argc, char *argv[] )
             // ------------------------------------
             case 'h':
                 fprintf( stdout, "Usage: %s [options]" EOL, argv[0] );
-                fprintf( stdout, "    -C, --cpufreq:      <Frequency in KHz> (Scaled) speed of the CPU" EOL );
-                fprintf( stdout, "    -f, --input-file:   <filename> Take input from specified file" EOL );
-                fprintf( stdout, "    -h, --help:         This help" EOL );
-                fprintf( stdout, "    -e, --elf:          <file>: Use this ELF file for information" EOL );
-                fprintf( stdout, "    -d, --debug:        Output a human-readable protobuf file" EOL );
-                fprintf( stdout, "    -t, --tpiu:         <channel>: Use TPIU decoder on specified channel (normally 1)" EOL );
-                fprintf( stdout, "    -v, --verbose:      <level> Verbose mode 0(errors)..3(debug)" EOL );
-                fprintf( stdout, "    -V, --version:      Print version and exit" EOL );
+                fprintf( stdout, "    -C, --cpufreq:            <Frequency in KHz> (Scaled) speed of the CPU" EOL );
+                fprintf( stdout, "    -f, --input-file:         <filename> Take input from specified file" EOL );
+                fprintf( stdout, "    -h, --help:               This help" EOL );
+                fprintf( stdout, "    -e, --elf:                <file>: Use this ELF file for information" EOL );
+                fprintf( stdout, "    -eb, --elf_bootloader:    <file>: Use this ELF file for bootloader information" EOL );
+                fprintf( stdout, "    -d, --debug:              Output a human-readable protobuf file" EOL );
+                fprintf( stdout, "    -t, --tpiu:               <channel>: Use TPIU decoder on specified channel (normally 1)" EOL );
+                fprintf( stdout, "    -v, --verbose:            <level> Verbose mode 0(errors)..3(debug)" EOL );
+                fprintf( stdout, "    -V, --version:            Print version and exit" EOL );
                 return false;
 
             // ------------------------------------
@@ -915,6 +977,10 @@ bool _processOptions( int argc, char *argv[] )
                 options.outputDebugFile = true;
                 break;
 
+            case 'a':
+                options.dbg_cc = atoi( optarg );
+                break;
+
             // ------------------------------------
             case 'f':
                 options.file = optarg;
@@ -925,6 +991,10 @@ bool _processOptions( int argc, char *argv[] )
                 options.elfFile = optarg;
                 break;
 
+            case 'b':
+                options.elfBootloaderFile = optarg;
+                break;
+
             // ------------------------------------
             case 't':
                 options.useTPIU = true;
@@ -940,6 +1010,7 @@ bool _processOptions( int argc, char *argv[] )
                 }
 
                 genericsSetReportLevel( (enum verbLevel)atoi( optarg ) );
+                options.verbose = (enum verbLevel) atoi( optarg );
                 break;
 
             // ------------------------------------
@@ -1012,6 +1083,7 @@ int main(int argc, char *argv[])
     ftrace->set_cpu(0);
 
     struct Stream *stream = streamCreateFile( options.file.c_str() );
+    genericsReport( V_INFO, "PreProcess Stream" EOL );
     while ( true )
     {
         size_t receivedSize;
@@ -1024,23 +1096,42 @@ int main(int argc, char *argv[])
         if (result == RECEIVE_RESULT_EOF or result == RECEIVE_RESULT_ERROR) break;
 
         unsigned char *c = cbw;
-        while (receivedSize--) _itmPumpProcessPre(*c++);
+        while (receivedSize--) _protocolPump(*c++,_itmPumpProcessPre,NULL); //_itmPumpProcessPre(*c++);
         fflush(stdout);
     }
     stream->close(stream);
     free(stream);
 
     printf("Loading ELF file %s with%s source lines\n", options.elfFile.c_str(), has_pc_samples ? "" : "out");
-    _r.symbols = symbolAcquire((char*)options.elfFile.c_str(), true, has_pc_samples);
-    assert( _r.symbols );
-    printf("Loaded ELF with %u sections:\n", _r.symbols->nsect_mem);
-    for (int ii = 0; ii < _r.symbols->nsect_mem; ii++)
+    _r.symbols_main = symbolAcquire((char*)options.elfFile.c_str(), true, has_pc_samples);
+    assert( _r.symbols_main );
+    printf("Loaded ELF with %u sections:\n", _r.symbols_main->nsect_mem);
+    for (int ii = 0; ii < _r.symbols_main->nsect_mem; ii++)
     {
-        auto mem = _r.symbols->mem[ii];
+        auto mem = _r.symbols_main->mem[ii];
         printf("  Section '%s': [0x%08lx, 0x%08lx] (%lu)\n", mem.name, mem.start, mem.start + mem.len, mem.len);
     }
+    if(options.elfBootloaderFile.size())
+    {
+        printf("Loading ELF file %s for bootloader with%s source lines\n", options.elfBootloaderFile.c_str(), has_pc_samples ? "" : "out");
+        _r.symbols_bootloader = symbolAcquire((char*)options.elfBootloaderFile.c_str(), true, has_pc_samples);
+        assert( _r.symbols_bootloader );
+        printf("Loaded ELF with %u sections:\n", _r.symbols_bootloader->nsect_mem);
+        for (int ii = 0; ii < _r.symbols_bootloader->nsect_mem; ii++)
+        {
+            auto mem = _r.symbols_bootloader->mem[ii];
+            printf("  Section '%s': [0x%08lx, 0x%08lx] (%lu)\n", mem.name, mem.start, mem.start + mem.len, mem.len);
+        }
+        _r.symbols = _r.symbols_bootloader;
+    }else{
+        _r.symbols = _r.symbols_main;
+    }
+
+    printf("Initialize Mortrall (Instruction tracing)\n");
+    mortrall.init(perfetto_trace,ftrace,options.cps, options.verbose,_r.symbols_main,_r.symbols_bootloader,_handleTSFromETM,_switchSymbols,options.dbg_cc);
 
     stream = streamCreateFile( options.file.c_str() );
+    genericsReport( V_INFO, "Process Stream" EOL );
     while ( true )
     {
         size_t receivedSize;
@@ -1053,7 +1144,7 @@ int main(int argc, char *argv[])
         if (result == RECEIVE_RESULT_EOF or result == RECEIVE_RESULT_ERROR) break;
 
         unsigned char *c = cbw;
-        while (receivedSize--) _protocolPump(*c++);
+        while (receivedSize--) _protocolPump(*c++,_itmPumpProcess,mortrall.dumpElement);
         fflush(stdout);
     }
     stream->close(stream);
@@ -1095,6 +1186,9 @@ int main(int argc, char *argv[])
                 thread->set_tid(tid);
                 thread->set_tgid(PID_TSK);
             }
+            auto *thread = process_tree->add_threads();
+            thread->set_tid(100);
+            thread->set_tgid(PID_TSK);
         }
         {
             auto *process = process_tree->add_processes();
@@ -1181,6 +1275,7 @@ int main(int argc, char *argv[])
             //     thread->set_name(buffer);
             // }
         }
+        mortrall.finalize(process_tree);
     }
 
 
diff --git a/ext/orbetto/subprojects/croaring.wrap b/ext/orbetto/subprojects/croaring.wrap
new file mode 100644
index 0000000..8ae8682
--- /dev/null
+++ b/ext/orbetto/subprojects/croaring.wrap
@@ -0,0 +1,4 @@
+[wrap-git]
+directory = CRoaring
+url = https://github.com/RoaringBitmap/CRoaring.git
+revision = ad487efbb48637445e6a5eee10c81cc9fdd7bf76
\ No newline at end of file
diff --git a/ext/orbetto/subprojects/orbuculum.wrap b/ext/orbetto/subprojects/orbuculum.wrap
index 364798a..81b4938 100644
--- a/ext/orbetto/subprojects/orbuculum.wrap
+++ b/ext/orbetto/subprojects/orbuculum.wrap
@@ -1,3 +1,3 @@
 [wrap-git]
 url = https://github.com/niklaut/orbuculum.git
-revision = 81b64da
+revision = Mortrall
diff --git a/src/emdbg/bench/data/fmu_v5x_orbtrace.cfg b/src/emdbg/bench/data/fmu_v5x_orbtrace.cfg
index 7a0b694..4b1cb4f 100644
--- a/src/emdbg/bench/data/fmu_v5x_orbtrace.cfg
+++ b/src/emdbg/bench/data/fmu_v5x_orbtrace.cfg
@@ -1,5 +1,5 @@
 source [find interface/cmsis-dap.cfg]
-cmsis-dap vid_pid 0x1209 0x3443
+# cmsis-dap vid_pid 0x1209 0x3443
 transport select swd
 
 source [find target/stm32f7x.cfg]
diff --git a/src/emdbg/bench/data/fmu_v6x_orbtrace.cfg b/src/emdbg/bench/data/fmu_v6x_orbtrace.cfg
index f6c3bc0..678c0e5 100644
--- a/src/emdbg/bench/data/fmu_v6x_orbtrace.cfg
+++ b/src/emdbg/bench/data/fmu_v6x_orbtrace.cfg
@@ -1,5 +1,5 @@
 source [find interface/cmsis-dap.cfg]
-cmsis-dap vid_pid 0x1209 0x3443
+#cmsis-dap vid_pid 0x1209 0x3443
 transport select swd
 
 source [find target/stm32h7x_dual_bank.cfg]
diff --git a/src/emdbg/debug/data/cortex_m.gdb b/src/emdbg/debug/data/cortex_m.gdb
index 7a7a732..75a4d4f 100644
--- a/src/emdbg/debug/data/cortex_m.gdb
+++ b/src/emdbg/debug/data/cortex_m.gdb
@@ -196,12 +196,29 @@ define px4_enable_trace_stm32h7
     set *($TPIUBASE+0x304) = 0x00000102
 end
 
+define px4_etm_trace_tpiu_swo_stm32f7
+    px4_reset
+
+    enableSTM32TRACE 4 3 1
+
+    px4_configure_orbuculum
+
+    startETM
+
+    # -o trace.swo dumps the RAW data, not the demuxed data!!!
+    shell killall orbuculum
+    shell orbuculum -O "-T4" &
+    shell sleep 1
+    shell nc localhost 3443 > trace.swo &
+end
+
+
 define px4_trace_tpiu_swo_stm32f7
     px4_reset
     tbreak nx_start
     continue
 
-    enableSTM32TRACE 4 3
+    enableSTM32TRACE 4 3 1
 
     px4_configure_orbuculum
 
@@ -212,6 +229,21 @@ define px4_trace_tpiu_swo_stm32f7
     shell nc localhost 3443 > trace.swo &
 end
 
+define px4_etm_trace_tpiu_swo_stm32h7
+    px4_reset
+
+    px4_enable_trace_stm32h7 4
+
+    px4_configure_orbuculum
+
+    startETM
+
+    # -o trace.swo dumps the RAW data, not the demuxed data!!!
+    shell killall orbuculum
+    shell orbuculum -O "-T4" &
+    shell sleep 1
+    shell nc localhost 3443 > trace.swo &
+end
 
 define px4_trace_tpiu_swo_stm32h7
     px4_reset
diff --git a/src/emdbg/debug/data/orbuculum.gdb b/src/emdbg/debug/data/orbuculum.gdb
index 6c1823b..f586a4c 100644
--- a/src/emdbg/debug/data/orbuculum.gdb
+++ b/src/emdbg/debug/data/orbuculum.gdb
@@ -88,6 +88,7 @@ set $ITMBASE=0xE0000000
 set $TPIUBASE=0xE0040000
 set $ETMBASE=0xE0041000
 
+
 define _setAddressesSTM32
   # Locations in the memory map for interesting things on STM32
   set $CPU = $CPU_STM32
@@ -165,6 +166,7 @@ set $TRCVDARCCTLR=$ETM4BASE+0x0a8
 set $TRCDEVARCH=$ETM4BASE+0xFBC
 # Trace IDR base address
 set $TRCIDR0=$ETM4BASE+0x1E0
+set $TRCIDR3=$ETM4BASE+0x1EC
 end
 # Trace Address comparator Value registers
 set $TRCACVR0=$ETMBASE+0x400
@@ -257,6 +259,59 @@ define _startETMv4
   end
 end
 # ====================================================================
+define _startETMv4_modified
+  echo ETMv4 Tracing is enabled\n
+  # Enable the return stack, global timestamping, Context ID, and Virtual context identifier tracing.
+  # Disabled global timestamp to not interfere with cycle count
+  set *($TRCCONFIGR) = 0x000010C1
+
+  # Disable all event tracing.
+  set *($TRCEVENTCTL0R) = 0
+  set *($TRCEVENTCTL1R) = 0
+
+  # Stalling is not possible on h7 and f7
+  # Disable or enable stalling for instructions, if implemented
+  set *($TRCSTALLCTLR) = (1<<13)|(1<<8)|(0x0f<<0)
+
+  # Fixed on STM32f7
+  # Trace sync every 1024 bytes of trace
+  set *($TRCSYNCPR) = 0x0c
+
+  # Do we want branch broadcasting?
+  set *$TRCACVR0=0
+  set *$TRCACVR1=0xFFFFFFFF
+  set *$TRCACATR0=0
+  set *$TRCACATR1=0
+  set *($TRCCONFIGR) |= ($br_out<<3)
+  set *($TRCBBCTLR) = ($br_out<<8)|0x03
+
+  # enable cycle count
+  set *($TRCCONFIGR) |= 0x10
+  set *($TRCCCCTLR) |= 0x10
+
+  # Trace on ID 2
+  set *($TRCTRACEIDR) = 2
+
+  # Disable timestamp event (This does not do anything)
+  set *($TRCTSCTLR) = 0
+
+  # Enable ViewInst to trace everything
+  set *($TRCVICTLR) = 0x201
+
+  # No address range filtering for ViewInst
+  set *($TRCVIIECTLR) = 0
+
+  # No start or stop points for ViewInst
+  set *($TRCVISSCTLR) = 0
+
+  # ...and start
+  set *($TRCPRGCTLR) |= (1<<0)
+
+  while (((*$TRCSTATR)&(1<<0))==1)
+  echo Wait for trace not idle\n
+  end
+end
+# ====================================================================
 define _startETMv35
 
   # Allow access to device
@@ -329,8 +384,10 @@ define startETM
 
 
   if (((*$TRCDEVARCH)&0xfff0ffff)  ==0x47704a13)
-    _startETMv4
+    echo ETMv4 version active \n
+    _startETMv4_modified
   else
+    echo ETMv3 version active \n
     _startETMv35
   end
 
@@ -891,6 +948,7 @@ define enableSTM32TRACE
 
   set $bits=4
   set $drive=1
+  set $remap=0
 
   if $argc >= 1
     set $bits = $arg0
@@ -907,6 +965,10 @@ define enableSTM32TRACE
     help enableSTM32TRACE
   end
 
+  if $argc >= 3
+    set $remap = $arg2
+  end
+
   set $bits = $bits-1
   set $CPU=$CPU_STM32
 
@@ -923,15 +985,20 @@ define enableSTM32TRACE
   enableSTM32Pin 4 2 $drive
   enableSTM32Pin 4 3 $drive
 
-  if ($bits>0)
+  if ($bits>=1)
      # Setup PE4
      enableSTM32Pin 4 4 $drive
   end
 
-  if ($bits>1)
-     # Setup PE5 & PC12
-     enableSTM32Pin 4 5 $drive
-     enableSTM32Pin 2 12 $drive
+  if ($bits>=2)
+    # Setup PE5 & PC12 
+    enableSTM32Pin 4 5 $drive
+    #enableSTM32Pin 4 6 $drive
+    if ($remap)
+      enableSTM32Pin 2 12 $drive 
+    else
+      enableSTM32Pin 4 6 $drive
+    end
   end
 
   # Set number of bits in DBGMCU_CR