diff --git a/CHANGELOG.md b/CHANGELOG.md
index 34bc9ce..65b475a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## [0.3.0] - 2024-10-27
+
+- Added rate limiting for expensive checks with the `run: "N times per day/hour"` option
+- Added a cache mechanism to store check results and error states, which allows for rate limiting and avoiding redundant runs when checks fail
+- Added automatic cache key expiration
+- Added error handling and feedback for rate-limited checks
+
 ## [0.2.0] - 2024-10-26
 
 - Improved the `allgood` DSL by adding optional conditionals on when individual checks are run
diff --git a/README.md b/README.md
index 9d6157b..4f2b063 100644
--- a/README.md
+++ b/README.md
@@ -182,6 +182,26 @@ check "Complex check",
 end
 ```
 
+### Rate Limiting Expensive Checks
+
+For expensive operations (like testing paid APIs), you can limit how often checks run:
+
+```ruby
+# Run expensive checks a limited number of times
+check "OpenAI is responding with a valid LLM message", run: "2 times per day" do
+  # expensive API call
+end
+
+check "Analytics can be processed", run: "4 times per hour" do
+  # expensive operation
+end
+```
+
+Important notes:
+- Rate limits reset at the start of each period (hour/day)
+- The error state persists between rate-limited runs
+- Rate-limited checks show clear feedback about remaining runs and next reset time
+
 When a check is skipped due to its conditions not being met, it will appear in the healthcheck page with a skip emoji (⏭️) and a clear explanation of why it was skipped.
 
 ![Example dashboard of the Allgood health check page with skipped checks](allgood_skipped.webp)
diff --git a/app/controllers/allgood/healthcheck_controller.rb b/app/controllers/allgood/healthcheck_controller.rb
index 787caae..5a1a622 100644
--- a/app/controllers/allgood/healthcheck_controller.rb
+++ b/app/controllers/allgood/healthcheck_controller.rb
@@ -44,20 +44,62 @@ def run_checks
     end
 
     def run_single_check(check)
+      last_result_key = "allgood:last_result:#{check[:name].parameterize}"
+      last_result = Allgood::CacheStore.instance.read(last_result_key)
+
+      unless Allgood.configuration.should_run_check?(check)
+        message = check[:skip_reason]
+        if last_result
+          status_info = "Last check #{last_result[:success] ? 'passed' : 'failed'} #{time_ago_in_words(last_result[:time])} ago: #{last_result[:message]}"
+          message = "#{message}. #{status_info}"
+        end
+
+        return {
+          name: check[:name],
+          success: last_result ? last_result[:success] : true,
+          skipped: true,
+          message: message,
+          duration: 0
+        }
+      end
+
       start_time = Time.now
       result = { success: false, message: "Check timed out after #{check[:timeout]} seconds" }
+      error_key = "allgood:error:#{check[:name].parameterize}"
 
       begin
         Timeout.timeout(check[:timeout]) do
           check_result = Allgood.configuration.run_check(&check[:block])
           result = { success: check_result[:success], message: check_result[:message] }
+
+          if result[:success]
+            # Clear error state and store successful result
+            Allgood::CacheStore.instance.write(error_key, nil)
+            Allgood::CacheStore.instance.write(last_result_key, {
+              success: true,
+              message: result[:message],
+              time: Time.current
+            })
+          end
+        end
+      rescue Timeout::Error, Allgood::CheckFailedError, StandardError => e
+        error_message = case e
+        when Timeout::Error
+          "Check timed out after #{check[:timeout]} seconds"
+        when Allgood::CheckFailedError
+          e.message
+        else
+          "Error: #{e.message}"
         end
-      rescue Timeout::Error
-        # The result is already set to a timeout message
-      rescue Allgood::CheckFailedError => e
-        result = { success: false, message: e.message }
-      rescue StandardError => e
-        result = { success: false, message: "Error: #{e.message}" }
+
+        # Store error state and failed result
+        Allgood::CacheStore.instance.write(error_key, error_message)
+        Allgood::CacheStore.instance.write(last_result_key, {
+          success: false,
+          message: error_message,
+          time: Time.current
+        })
+        result = { success: false, message: error_message }
       end
 
       {
diff --git a/lib/allgood.rb b/lib/allgood.rb
index f2d61ae..eec13a4 100644
--- a/lib/allgood.rb
+++ b/lib/allgood.rb
@@ -3,6 +3,7 @@
 require_relative "allgood/version"
 require_relative "allgood/engine"
 require_relative "allgood/configuration"
+require_relative "allgood/cache_store"
 
 module Allgood
   class Error < StandardError; end
diff --git a/lib/allgood/cache_store.rb b/lib/allgood/cache_store.rb
new file mode 100644
index 0000000..2980c28
--- /dev/null
+++ b/lib/allgood/cache_store.rb
@@ -0,0 +1,66 @@
+# frozen_string_literal: true
+
+module Allgood
+  # Key/value store for check results and rate-limit state. Uses Rails.cache
+  # when it passes a write/read probe and an in-process Hash otherwise.
+  class CacheStore
+    # Returns the shared, lazily created store.
+    def self.instance
+      @instance ||= new
+    end
+
+    def initialize
+      @memory_store = {}
+    end
+
+    # Returns the value stored under +key+, or nil when there is none.
+    def read(key)
+      if rails_cache_available?
+        Rails.cache.read(key)
+      else
+        @memory_store[key]
+      end
+    end
+
+    # Stores +value+ under +key+.
+    #
+    # +expires_in+ is passed to Rails.cache only; the in-memory fallback keeps
+    # entries until they are overwritten. It defaults to one day so that state
+    # for "per day" limits survives the whole day: keys are built from the
+    # check name and a date stamp, so the period cannot be read off the key.
+    def write(key, value, expires_in: 1.day)
+      if rails_cache_available?
+        Rails.cache.write(key, value, expires_in: expires_in)
+      else
+        @memory_store[key] = value
+      end
+    end
+
+    # Best-effort removal of cache keys stamped with the date two days ago.
+    # Does nothing unless Rails.cache is usable and supports delete_matched;
+    # failures are logged, not raised.
+    def cleanup_old_keys
+      return unless rails_cache_available?
+
+      keys_pattern = "allgood:*"
+      if Rails.cache.respond_to?(:delete_matched)
+        Rails.cache.delete_matched("#{keys_pattern}:*:#{(Time.current - 2.days).strftime('%Y-%m-%d')}*")
+      end
+    rescue StandardError => e
+      Rails.logger.warn "Allgood: Failed to cleanup old cache keys: #{e.message}"
+    end
+
+    private
+
+    # Probes Rails.cache with a write/read round trip; a store that cannot
+    # return what was just written (or that raises) counts as unavailable.
+    def rails_cache_available?
+      Rails.cache && Rails.cache.respond_to?(:read) && Rails.cache.respond_to?(:write) &&
+        Rails.cache.write("allgood_rails_cache_test_ok", "true") &&
+        Rails.cache.read("allgood_rails_cache_test_ok") == "true"
+    rescue StandardError => e
+      Rails.logger.warn "Allgood: Rails.cache not available (#{e.message}), falling back to memory store"
+      false
+    end
+  end
+end
diff --git a/lib/allgood/configuration.rb b/lib/allgood/configuration.rb
index 9b10d6c..db43bfd 100644
--- a/lib/allgood/configuration.rb
+++ b/lib/allgood/configuration.rb
@@ -17,6 +17,18 @@ def check(name, **options, &block)
         status: :pending
       }
 
+      # Handle rate limiting
+      if options[:run]
+        begin
+          check_info[:rate] = parse_run_frequency(options[:run])
+        rescue ArgumentError => e
+          check_info[:status] = :skipped
+          check_info[:skip_reason] = "Invalid run frequency: #{e.message}"
+          @checks << check_info
+          return
+        end
+      end
+
       # Handle environment-specific options
       if options[:only]
         environments = Array(options[:only])
@@ -66,6 +78,116 @@ def check(name, **options, &block)
     def run_check(&block)
       CheckRunner.new.instance_eval(&block)
     end
+
+    # Decides whether a check may run now under its `run:` rate limit.
+    #
+    # Checks without a rate always run. For rate-limited checks, a true result
+    # also records the run in the cache, and a false result stores the
+    # explanation in check[:skip_reason].
+    def should_run_check?(check)
+      return true unless check[:rate]
+
+      rate = check[:rate]
+      cache = Allgood::CacheStore.instance
+      slug = check[:name].parameterize
+      # Computed once so every key below refers to the same period.
+      period_key = current_period(rate)
+
+      cache_key = "allgood:last_run:#{slug}"
+      runs_key = "allgood:runs_count:#{slug}:#{period_key}"
+      error_key = "allgood:error:#{slug}"
+      stored_period_key = "allgood:current_period:#{slug}"
+
+      period_runs = cache.read(runs_key).to_i
+
+      # If we're in a new period, reset the counter and drop the previous
+      # period's error; otherwise a failed check stays blocked instead of being
+      # retried at the time announced in its skip reason.
+      if cache.read(stored_period_key) != period_key
+        period_runs = 0
+        cache.write(stored_period_key, period_key)
+        cache.write(runs_key, 0)
+        cache.write(error_key, nil)
+      end
+
+      rate_info = "Rate limited (#{period_runs}/#{rate[:max_runs]} runs this #{rate[:period]})"
+
+      # If there's an error, wait until next period
+      if cache.read(error_key)
+        next_period = next_period_start(rate)
+        check[:skip_reason] = "#{rate_info}. Waiting until #{next_period.strftime('%H:%M:%S %Z')} to retry failed check"
+        return false
+      end
+
+      # If we haven't exceeded the max runs for this period
+      if period_runs < rate[:max_runs]
+        cache.write(cache_key, Time.current)
+        cache.write(runs_key, period_runs + 1)
+        true
+      else
+        next_period = next_period_start(rate)
+        next_run = "Next check at #{next_period.strftime('%H:%M:%S %Z')}"
+        check[:skip_reason] = "#{rate_info}. #{next_run}"
+        false
+      end
+    end
+
+    private
+
+    # Parses a `run:` option such as "2 times per day" into
+    # { max_runs: Integer, period: "day" | "hour" }.
+    # Raises ArgumentError for an unsupported format or a count outside 1..1000.
+    def parse_run_frequency(frequency)
+      case frequency.to_s.downcase
+      when /(\d+)\s+times?\s+per\s+(day|hour)/i
+        max_runs, period = $1.to_i, $2
+        if max_runs <= 0
+          raise ArgumentError, "Number of runs must be positive"
+        end
+        if max_runs > 1000
+          raise ArgumentError, "Maximum 1000 runs per period allowed"
+        end
+        { max_runs: max_runs, period: period }
+      else
+        raise ArgumentError, "Unsupported frequency format. Use 'N times per day' or 'N times per hour'"
+      end
+    end
+
+    # Returns the cache stamp of the current period: the date for 'day',
+    # the date plus hour for 'hour' (nil for any other period).
+    def current_period(rate)
+      case rate[:period]
+      when 'day'
+        Time.current.strftime('%Y-%m-%d')
+      when 'hour'
+        Time.current.strftime('%Y-%m-%d-%H')
+      end
+    end
+
+    # True when last_run falls in a different period than the current time.
+    def new_period?(last_run, rate)
+      case rate[:period]
+      when 'day'
+        # Compare by value: Object#equal? tests identity, which is never true
+        # for two freshly built Date objects.
+        last_run.to_date != Time.current.to_date
+      when 'hour'
+        last_run.strftime('%Y-%m-%d-%H') != Time.current.strftime('%Y-%m-%d-%H')
+      end
+    end
+
+    # Returns the time at which the next day or hour begins.
+    # Raises ArgumentError for any other period.
+    def next_period_start(rate)
+      case rate[:period]
+      when 'day'
+        Time.current.beginning_of_day + 1.day
+      when 'hour'
+        Time.current.beginning_of_hour + 1.hour
+      else
+        raise ArgumentError, "Unsupported period: #{rate[:period]}"
+      end
+    end
   end
 
   class CheckRunner
diff --git a/lib/allgood/version.rb b/lib/allgood/version.rb
index dbb6c8c..23cffa9 100644
--- a/lib/allgood/version.rb
+++ b/lib/allgood/version.rb
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Allgood
-  VERSION = "0.2.0"
+  VERSION = "0.3.0"
 end