Skip to content

Commit

Permalink
[stormond] Added new dynamic field 'last_sync_time' to STATE_DB (#535)
Browse files Browse the repository at this point in the history
* Added new dynamic field 'last_sync_time' that shows when STORAGE_INFO for disk was last synced to STATE_DB

* Moved 'start' message to actual starting point of the daemon

* Added functions for formatted and epoch time for user friendly time display

* Made changes per prgeor review comments

* Pivot to SysLogger for all logging

* Increased log level so that they are seen in syslogs

* Code coverage improvement
  • Loading branch information
assrinivasan authored Nov 27, 2024
1 parent 0431fa3 commit 3624cb7
Show file tree
Hide file tree
Showing 2 changed files with 265 additions and 62 deletions.
77 changes: 45 additions & 32 deletions sonic-stormond/scripts/stormond
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import shutil
import json
import time

from datetime import datetime
from sonic_py_common import daemon_base, device_info, syslogger
from swsscommon import swsscommon
from sonic_platform_base.sonic_storage.storage_devices import StorageDevices, BLKDEV_BASE_PATH
Expand Down Expand Up @@ -49,6 +50,8 @@ class DaemonStorage(daemon_base.DaemonBase):
self.log = syslogger.SysLogger(SYSLOG_IDENTIFIER)
super(DaemonStorage, self).__init__(log_identifier)

self.log.log_notice("Starting Storage Monitoring Daemon")

self.timeout = STORMOND_PERIODIC_STATEDB_SYNC_SECS
self.fsstats_sync_interval = STORMOND_SYNC_TO_DISK_SECS
self.stop_event = threading.Event()
Expand All @@ -68,6 +71,9 @@ class DaemonStorage(daemon_base.DaemonBase):
self.fsio_rw_json = {disk:{} for disk in self.storage.devices}
self.fsio_rw_statedb = {disk:{} for disk in self.storage.devices}

# This is the time format string
self.time_format_string = "%Y-%m-%d %H:%M:%S"

# This time is set at init and then subsequently after each FSIO JSON file sync
self.fsio_sync_time = time.time()

Expand All @@ -82,7 +88,8 @@ class DaemonStorage(daemon_base.DaemonBase):
"total_fsio_writes", \
"disk_io_reads", \
"disk_io_writes", \
"reserved_blocks"]
"reserved_blocks", \
"last_sync_time"]

# These are the fields that we are interested in saving to disk to protect against
# reboots or crashes
Expand All @@ -97,21 +104,26 @@ class DaemonStorage(daemon_base.DaemonBase):
self._load_fsio_rw_json()
self._determine_sot()

# This function is used to convert the epoch time to a user friendly formatted string
def get_formatted_time(self, time_since_epoch):
return datetime.fromtimestamp(time_since_epoch).strftime(self.time_format_string)

# This function is used to configure the polling and sync intervals for the daemon
def get_configdb_intervals(self):
self.config_db = daemon_base.db_connect("CONFIG_DB")
config_info = dict(self.config_db.hgetall('STORMOND_CONFIG|INTERVALS'))
self.timeout = int(config_info.get('daemon_polling_interval', STORMOND_PERIODIC_STATEDB_SYNC_SECS))
self.fsstats_sync_interval = int(config_info.get('fsstats_sync_interval', STORMOND_SYNC_TO_DISK_SECS))

self.log_info("Polling Interval set to {} seconds".format(self.timeout))
self.log_info("FSIO JSON file Interval set to {} seconds".format(self.fsstats_sync_interval))
self.log.log_notice("Polling Interval set to {} seconds".format(self.timeout))
self.log.log_notice("FSIO JSON file Interval set to {} seconds".format(self.fsstats_sync_interval))


# Get the total and latest FSIO reads and writes from JSON file
def _load_fsio_rw_json(self):
try:
if not os.path.exists(FSIO_RW_JSON_FILE):
self.log_info("{} not present.".format(FSIO_RW_JSON_FILE))
self.log.log_notice("{} not present.".format(FSIO_RW_JSON_FILE))
return

# Load JSON file
Expand All @@ -123,21 +135,21 @@ class DaemonStorage(daemon_base.DaemonBase):
for field in self.statedb_json_sync_fields:

if self.fsio_rw_json[storage_device][field] == None:
self.log_warning("{}:{} value = None in JSON file".format(storage_device, field))
self.log.log_warning("{}:{} value = None in JSON file".format(storage_device, field))
return

self.fsio_json_file_loaded = True

except Exception as e:
self.log_error("JSON file could not be loaded: {}".format(str(e)))
self.log.log_error("JSON file could not be loaded: {}".format(str(e)))

return


# Sync the total and latest procfs reads and writes from STATE_DB to JSON file on disk
def sync_fsio_rw_json(self):

self.log_info("Syncing total and latest procfs reads and writes from STATE_DB to JSON file")
self.log.log_notice("Syncing total and latest procfs reads and writes from STATE_DB to JSON file")

json_file_dict = {disk:{} for disk in self.storage.devices}
try:
Expand All @@ -146,21 +158,21 @@ class DaemonStorage(daemon_base.DaemonBase):
json_file_dict[device][field] = self.state_db.hget('STORAGE_INFO|{}'.format(device), field)

self.fsio_sync_time = time.time()
json_file_dict["successful_sync_time"] = str(self.fsio_sync_time)
json_file_dict["successful_sync_time"] = str(self.get_formatted_time(self.fsio_sync_time))

with open(FSIO_RW_JSON_FILE, 'w+') as f:
json.dump(json_file_dict, f)

return True

except Exception as ex:
self.log_error("Unable to sync state_db to disk: {}".format(str(ex)))
self.log.log_error("Unable to sync state_db to disk: {}".format(str(ex)))
return False


# Update the successful sync time to STATE_DB
def write_sync_time_statedb(self):
self.state_db.hset("{}|{}".format(STORAGE_DEVICE_TABLE,FSSTATS_SYNC_TIME_KEY), "successful_sync_time", str(self.fsio_sync_time))
self.state_db.hset("{}|{}".format(STORAGE_DEVICE_TABLE,FSSTATS_SYNC_TIME_KEY), "successful_sync_time", str(self.get_formatted_time(self.fsio_sync_time)))

# Run a sanity check on the state_db. If successful, get total, latest
# FSIO reads and writes for each storage device from STATE_DB
Expand All @@ -185,12 +197,12 @@ class DaemonStorage(daemon_base.DaemonBase):
self.fsio_rw_statedb[storage_device][field] = "0" if value is None else value

if value is None:
self.log_warning("{}:{} value = None in StateDB".format(storage_device, field))
self.log.log_warning("{}:{} value = None in StateDB".format(storage_device, field))
return

self.statedb_storage_info_loaded = True
except Exception as e:
self.log_error("Reading STATE_DB failed with: {}".format(str(e)))
self.log.log_error("Reading STATE_DB failed with: {}".format(str(e)))


def _determine_sot(self):
Expand Down Expand Up @@ -269,21 +281,21 @@ class DaemonStorage(daemon_base.DaemonBase):
try:
# Unlikely scenario
if storage_object is None:
self.log_info("{} does not have an instantiated object. Static Information cannot be gathered.".format(storage_device))
self.log.log_notice("{} does not have an instantiated object. Static Information cannot be gathered.".format(storage_device))
continue

static_kvp_dict = {}

static_kvp_dict["device_model"] = storage_object.get_model()
static_kvp_dict["serial"] = storage_object.get_serial()

self.log_info("Storage Device: {}, Device Model: {}, Serial: {}".format(storage_device, static_kvp_dict["device_model"], static_kvp_dict["serial"]))
self.log.log_notice("Storage Device: {}, Device Model: {}, Serial: {}".format(storage_device, static_kvp_dict["device_model"], static_kvp_dict["serial"]))

# update Storage Device Status to DB
self.update_storage_info_status_db(storage_device, static_kvp_dict)

except Exception as ex:
self.log_error("get_static_fields_update_state_db() failed with: {}".format(str(ex)))
self.log.log_error("get_static_fields_update_state_db() failed with: {}".format(str(ex)))

# Get Dynamic attributes and update the State DB
def get_dynamic_fields_update_state_db(self):
Expand All @@ -292,7 +304,7 @@ class DaemonStorage(daemon_base.DaemonBase):
for storage_device, storage_object in self.storage.devices.items():
try:
if storage_object is None:
self.log_info("Storage device '{}' does not have an instantiated object. Dynamic Information cannot be gathered.".format(storage_device))
self.log.log_notice("Storage device '{}' does not have an instantiated object. Dynamic Information cannot be gathered.".format(storage_device))
continue

# Fetch the latest dynamic info
Expand All @@ -309,20 +321,23 @@ class DaemonStorage(daemon_base.DaemonBase):
dynamic_kvp_dict["disk_io_reads"] = storage_object.get_disk_io_reads()
dynamic_kvp_dict["disk_io_writes"] = storage_object.get_disk_io_writes()
dynamic_kvp_dict["reserved_blocks"] = storage_object.get_reserved_blocks()
dynamic_kvp_dict["last_sync_time"] = self.get_formatted_time(time.time())

dynamic_kvp_dict["total_fsio_reads"], dynamic_kvp_dict["total_fsio_writes"] = self._reconcile_fsio_rw_values(dynamic_kvp_dict, storage_device)

self.log_info("Storage Device: {}, Firmware: {}, health: {}%, Temp: {}C, FS IO Reads: {}, FS IO Writes: {}".format(\
storage_device, dynamic_kvp_dict["firmware"], dynamic_kvp_dict["health"], dynamic_kvp_dict["temperature"], dynamic_kvp_dict["total_fsio_reads"],dynamic_kvp_dict["total_fsio_writes"]))
self.log_info("Latest FSIO Reads: {}, Latest FSIO Writes: {}".format(dynamic_kvp_dict["latest_fsio_reads"], dynamic_kvp_dict["latest_fsio_writes"]))
self.log_info("Disk IO Reads: {}, Disk IO Writes: {}, Reserved Blocks: {}".format(dynamic_kvp_dict["disk_io_reads"], dynamic_kvp_dict["disk_io_writes"], \
dynamic_kvp_dict["reserved_blocks"]))

# Update storage device statistics to STATE_DB
self.update_storage_info_status_db(storage_device, dynamic_kvp_dict)

# Log to syslog
self.log.log_notice("Storage Device: {}, Firmware: {}, health: {}%, Temp: {}C, FS IO Reads: {}, FS IO Writes: {}".format(\
storage_device, dynamic_kvp_dict["firmware"], dynamic_kvp_dict["health"], dynamic_kvp_dict["temperature"], dynamic_kvp_dict["total_fsio_reads"],dynamic_kvp_dict["total_fsio_writes"]))
self.log.log_notice("Latest FSIO Reads: {}, Latest FSIO Writes: {}".format(dynamic_kvp_dict["latest_fsio_reads"], dynamic_kvp_dict["latest_fsio_writes"]))
self.log.log_notice("Disk IO Reads: {}, Disk IO Writes: {}, Reserved Blocks: {}".format(dynamic_kvp_dict["disk_io_reads"], dynamic_kvp_dict["disk_io_writes"], \
dynamic_kvp_dict["reserved_blocks"]))
self.log.log_notice("Last successful sync time to STATE_DB: {}".format(dynamic_kvp_dict["last_sync_time"]))

except Exception as ex:
self.log_info("get_dynamic_fields_update_state_db() failed with: {}".format(str(ex)))
self.log.log_notice("get_dynamic_fields_update_state_db() failed with: {}".format(str(ex)))


# Override signal handler from DaemonBase
Expand All @@ -333,22 +348,22 @@ class DaemonStorage(daemon_base.DaemonBase):
global exit_code

if sig in FATAL_SIGNALS:
self.log_info("Caught signal '{}'".format(signal.Signals(sig).name))
self.log.log_notice("Caught signal '{}'".format(signal.Signals(sig).name))

if self.sync_fsio_rw_json():
self.write_sync_time_statedb()
else:
self.log_warning("Unable to sync latest and total procfs RW to disk")
self.log.log_warning("Unable to sync latest and total procfs RW to disk")

self.log_info("Exiting with {}".format(signal.Signals(sig).name))
self.log.log_notice("Exiting with {}".format(signal.Signals(sig).name))

# Make sure we exit with a non-zero code so that supervisor will try to restart us
exit_code = 128 + sig
self.stop_event.set()
elif sig in NONFATAL_SIGNALS:
self.log_info("Caught signal '{}' - ignoring...".format(signal.Signals(sig).name))
self.log.log_notice("Caught signal '{}' - ignoring...".format(signal.Signals(sig).name))
else:
self.log_warning("Caught unhandled signal '{}' - ignoring...".format(signal.Signals(sig).name))
self.log.log_warning("Caught unhandled signal '{}' - ignoring...".format(signal.Signals(sig).name))

# Main daemon logic
def run(self):
Expand All @@ -374,7 +389,7 @@ class DaemonStorage(daemon_base.DaemonBase):
if self.sync_fsio_rw_json():
self.write_sync_time_statedb()
else:
self.log_warning("Unable to sync latest and total procfs RW to disk")
self.log.log_warning("Unable to sync latest and total procfs RW to disk")

return True
#
Expand All @@ -385,15 +400,13 @@ class DaemonStorage(daemon_base.DaemonBase):
def main():
    """Entry point: publish static device info once, then run the periodic loop.

    Returns the process exit code (non-zero after a fatal signal so the
    supervisor restarts the daemon).
    """
    stormon = DaemonStorage(SYSLOG_IDENTIFIER)
    # The "Starting ..." notice is emitted inside DaemonStorage.__init__,
    # at the daemon's actual starting point.

    # Static attributes (model, serial) only need to be published to the
    # StateDB once, before the periodic loop starts.
    stormon.get_static_fields_update_state_db()

    # run() returns falsy once the stop event has been set by a fatal signal.
    while stormon.run():
        pass

    stormon.log.log_notice("Shutting down Storage Monitoring Daemon")

    return exit_code

Expand Down
Loading

0 comments on commit 3624cb7

Please sign in to comment.