Skip to content

Commit

Permalink
Add a run: "2 times per day" option to rate limit some checks (exam…
Browse files Browse the repository at this point in the history
…ple: expensive API calls)
  • Loading branch information
rameerez committed Nov 12, 2024
1 parent 045ef20 commit d29b595
Show file tree
Hide file tree
Showing 7 changed files with 232 additions and 7 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## [0.3.0] - 2024-10-27

- Added rate limiting for expensive checks with the `run: "N times per day/hour"` option
- Added a cache mechanism that stores check results and error states, enabling rate limiting and avoiding redundant re-runs of failed checks
- Added automatic cache key expiration
- Added error handling and feedback for rate-limited checks

## [0.2.0] - 2024-10-26

- Improved the `allgood` DSL by adding optional conditionals on when individual checks are run
Expand Down
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,26 @@ check "Complex check",
end
```

### Rate Limiting Expensive Checks

For expensive operations (like testing paid APIs), you can limit how often checks run:

```ruby
# Run expensive checks a limited number of times
check "OpenAI is responding with a valid LLM message", run: "2 times per day" do
# expensive API call
end

check "Analytics can be processed", run: "4 times per hour" do
# expensive operation
end
```

Important notes:
- Rate limits reset at the start of each period (hour/day)
- If a rate-limited check fails, its error state is stored and the check is not retried until the next period
- Rate-limited checks show clear feedback about remaining runs and next reset time

When a check is skipped due to its conditions not being met, it will appear in the healthcheck page with a skip emoji (⏭️) and a clear explanation of why it was skipped.

![Example dashboard of the Allgood health check page with skipped checks](allgood_skipped.webp)
Expand Down
54 changes: 48 additions & 6 deletions app/controllers/allgood/healthcheck_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,62 @@ def run_checks
end

def run_single_check(check)
last_result_key = "allgood:last_result:#{check[:name].parameterize}"
last_result = Allgood::CacheStore.instance.read(last_result_key)

unless Allgood.configuration.should_run_check?(check)
message = check[:skip_reason]
if last_result
status_info = "Last check #{last_result[:success] ? 'passed' : 'failed'} #{time_ago_in_words(last_result[:time])} ago: #{last_result[:message]}"
message = "#{message}. #{status_info}"
end

return {
name: check[:name],
success: last_result ? last_result[:success] : true,
skipped: true,
message: message,
duration: 0
}
end

start_time = Time.now
result = { success: false, message: "Check timed out after #{check[:timeout]} seconds" }
error_key = "allgood:error:#{check[:name].parameterize}"

begin
Timeout.timeout(check[:timeout]) do
check_result = Allgood.configuration.run_check(&check[:block])
result = { success: check_result[:success], message: check_result[:message] }

if result[:success]
# Clear error state and store successful result
Allgood::CacheStore.instance.write(error_key, nil)
Allgood::CacheStore.instance.write(last_result_key, {
success: true,
message: result[:message],
time: Time.current
})
end
end
rescue Timeout::Error, Allgood::CheckFailedError, StandardError => e
error_message = case e
when Timeout::Error
"Check timed out after #{check[:timeout]} seconds"
when Allgood::CheckFailedError
e.message
else
"Error: #{e.message}"
end
rescue Timeout::Error
# The result is already set to a timeout message
rescue Allgood::CheckFailedError => e
result = { success: false, message: e.message }
rescue StandardError => e
result = { success: false, message: "Error: #{e.message}" }

# Store error state and failed result
Allgood::CacheStore.instance.write(error_key, error_message)
Allgood::CacheStore.instance.write(last_result_key, {
success: false,
message: error_message,
time: Time.current
})
result = { success: false, message: error_message }
end

{
Expand Down
1 change: 1 addition & 0 deletions lib/allgood.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require_relative "allgood/version"
require_relative "allgood/engine"
require_relative "allgood/configuration"
require_relative "allgood/cache_store"

module Allgood
class Error < StandardError; end
Expand Down
52 changes: 52 additions & 0 deletions lib/allgood/cache_store.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# frozen_string_literal: true

module Allgood
  # Small key/value store used by Allgood to remember check results, error
  # states and rate-limiting counters.
  #
  # Values go to `Rails.cache` when it passes a write/read round-trip probe;
  # otherwise they are kept in a plain in-process Hash.
  class CacheStore
    # Returns the shared, lazily created store instance.
    #
    # @return [Allgood::CacheStore]
    def self.instance
      @instance ||= new
    end

    def initialize
      # Fallback storage used whenever Rails.cache is not usable.
      @memory_store = {}
    end

    # Reads the value stored under +key+.
    #
    # @param key [String]
    # @return [Object, nil] the stored value, or nil when nothing is stored
    def read(key)
      if rails_cache_available?
        Rails.cache.read(key)
      else
        @memory_store[key]
      end
    end

    # Stores +value+ under +key+.
    #
    # When Rails.cache is used, the entry expires after one day if the key
    # contains the substring 'day', and after one hour otherwise. Entries in
    # the in-memory fallback never expire.
    #
    # NOTE(review): the keys built by Configuration#should_run_check? embed the
    # check name and a date stamp, not the period name, so this heuristic
    # appears to pick 1.day only when the check name itself contains "day" —
    # confirm the intended expiry for daily rate limits.
    #
    # @param key [String]
    # @param value [Object]
    def write(key, value)
      if rails_cache_available?
        expiry = key.include?('day') ? 1.day : 1.hour
        Rails.cache.write(key, value, expires_in: expiry)
      else
        @memory_store[key] = value
      end
    end

    # Best-effort removal of Allgood keys stamped with the date from two days
    # ago. Does nothing when Rails.cache is unavailable or does not support
    # `delete_matched`; failures are logged rather than raised.
    def cleanup_old_keys
      return unless rails_cache_available?

      keys_pattern = "allgood:*"
      if Rails.cache.respond_to?(:delete_matched)
        Rails.cache.delete_matched("#{keys_pattern}:*:#{(Time.current - 2.days).strftime('%Y-%m-%d')}*")
      end
    rescue StandardError => e
      log_warning "Allgood: Failed to cleanup old cache keys: #{e.message}"
    end

    private

    # Probes Rails.cache with a write/read round-trip on every call.
    #
    # Returns a falsy value (instead of raising) when Rails is not loaded, the
    # cache is missing, or the probe fails, so callers fall back to the
    # in-memory store.
    def rails_cache_available?
      # Without this guard a missing Rails constant raised NameError, and the
      # rescue below raised again while trying to log through Rails.logger.
      return false unless defined?(Rails) && Rails.respond_to?(:cache)

      Rails.cache && Rails.cache.respond_to?(:read) && Rails.cache.respond_to?(:write) &&
        Rails.cache.write("allgood_rails_cache_test_ok", "true") &&
        Rails.cache.read("allgood_rails_cache_test_ok") == "true"
    rescue StandardError => e
      log_warning "Allgood: Rails.cache not available (#{e.message}), falling back to memory store"
      false
    end

    # Logs a warning through Rails.logger when one exists; silently skips
    # logging otherwise so that error handling can never raise on its own.
    def log_warning(message)
      return unless defined?(Rails) && Rails.respond_to?(:logger)

      Rails.logger&.warn(message)
    end
  end
end
103 changes: 103 additions & 0 deletions lib/allgood/configuration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,18 @@ def check(name, **options, &block)
status: :pending
}

# Handle rate limiting
if options[:run]
begin
check_info[:rate] = parse_run_frequency(options[:run])
rescue ArgumentError => e
check_info[:status] = :skipped
check_info[:skip_reason] = "Invalid run frequency: #{e.message}"
@checks << check_info
return
end
end

# Handle environment-specific options
if options[:only]
environments = Array(options[:only])
Expand Down Expand Up @@ -66,6 +78,97 @@ def check(name, **options, &block)
# Evaluates the given check block with a freshly created CheckRunner as
# `self` and returns whatever the block evaluates to.
def run_check(&block)
  runner = CheckRunner.new
  runner.instance_eval(&block)
end

# Decides whether a check may run right now, given its optional rate limit.
#
# Checks without a :rate always run. For rate-limited checks the number of
# runs in the current period is tracked in Allgood::CacheStore; when the
# check is not allowed to run, check[:skip_reason] is set to a human-readable
# explanation.
#
# Side effects when the check is allowed to run: the last-run timestamp and
# the per-period run counter are written to the cache store.
#
# @param check [Hash] check definition; reads :name and :rate
#   (:rate is a Hash with :max_runs and :period), may write :skip_reason
# @return [Boolean] true when the check should run now
def should_run_check?(check)
  rate = check[:rate]
  return true unless rate

  store = Allgood::CacheStore.instance
  slug = check[:name].parameterize
  # Compute the period stamp once so every key below refers to the same
  # period even if the clock crosses a boundary during this call.
  period_stamp = current_period(rate)

  last_run_key = "allgood:last_run:#{slug}"
  runs_key = "allgood:runs_count:#{slug}:#{period_stamp}"
  error_key = "allgood:error:#{slug}"
  stored_period_key = "allgood:current_period:#{slug}"

  period_runs = store.read(runs_key).to_i

  # If we're in a new period, reset the counter and forget the previous
  # failure: a failed check is only held back "until the next period", so the
  # error flag must not outlive the period in which it was recorded.
  if store.read(stored_period_key) != period_stamp
    period_runs = 0
    store.write(stored_period_key, period_stamp)
    store.write(runs_key, 0)
    store.write(error_key, nil)
  end

  rate_info = "Rate limited (#{period_runs}/#{rate[:max_runs]} runs this #{rate[:period]})"

  # If there's an error recorded in this period, wait until the next period
  if store.read(error_key)
    next_period = next_period_start(rate)
    check[:skip_reason] = "#{rate_info}. Waiting until #{next_period.strftime('%H:%M:%S %Z')} to retry failed check"
    return false
  end

  # Run budget for this period is exhausted
  if period_runs >= rate[:max_runs]
    next_period = next_period_start(rate)
    next_run = "Next check at #{next_period.strftime('%H:%M:%S %Z')}"
    check[:skip_reason] = "#{rate_info}. #{next_run}"
    return false
  end

  store.write(last_run_key, Time.current)
  store.write(runs_key, period_runs + 1)
  true
end

private

# Parses a run-frequency expression such as "2 times per day" or
# "1 time per hour" (case-insensitive, surrounding whitespace ignored).
#
# The whole string must match the format: previously the pattern was
# unanchored, so inputs like "-5 times per day" or "1.5 times per hour" were
# silently read as 5 runs.
#
# @param frequency [#to_s] the frequency expression
# @return [Hash] { max_runs: Integer, period: "day" | "hour" }
# @raise [ArgumentError] when the format is not recognised, or the number of
#   runs is not within 1..1000
def parse_run_frequency(frequency)
  match = frequency.to_s.strip.downcase.match(/\A(-?\d+)\s+times?\s+per\s+(day|hour)\z/)
  unless match
    raise ArgumentError, "Unsupported frequency format. Use 'N times per day' or 'N times per hour'"
  end

  max_runs = match[1].to_i
  raise ArgumentError, "Number of runs must be positive" if max_runs <= 0
  raise ArgumentError, "Maximum 1000 runs per period allowed" if max_runs > 1000

  { max_runs: max_runs, period: match[2] }
end

# Returns a string stamp identifying the current rate-limit period:
# the date for 'day' periods, the date plus hour for 'hour' periods.
# Returns nil for any other period.
def current_period(rate)
  stamp_format = { 'day' => '%Y-%m-%d', 'hour' => '%Y-%m-%d-%H' }[rate[:period]]
  return unless stamp_format

  Time.current.strftime(stamp_format)
end

# Tells whether +last_run+ falls in an earlier rate-limit period than now.
#
# @param last_run [Time] time of the previous run
# @param rate [Hash] rate definition; reads :period ('day' or 'hour')
# @return [Boolean, nil] true when the period has changed since +last_run+;
#   nil for an unrecognised period
def new_period?(last_run, rate)
  case rate[:period]
  when 'day'
    # Compare by value: `equal?` tests object identity, which is never true
    # for two separately built Date objects, so every run looked like a new day.
    last_run.to_date != Time.current.to_date
  when 'hour'
    last_run.strftime('%Y-%m-%d-%H') != Time.current.strftime('%Y-%m-%d-%H')
  end
end

# Returns the time at which the next rate-limit period begins: the start of
# tomorrow for 'day' periods, the start of the next hour for 'hour' periods.
#
# Raises ArgumentError for any other period.
def next_period_start(rate)
  period = rate[:period]
  return Time.current.beginning_of_day + 1.day if period == 'day'
  return Time.current.beginning_of_hour + 1.hour if period == 'hour'

  raise ArgumentError, "Unsupported period: #{period}"
end
end

class CheckRunner
Expand Down
2 changes: 1 addition & 1 deletion lib/allgood/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

module Allgood
  # Current gem version. Defined exactly once: assigning the constant a second
  # time (the stale "0.2.0" line) triggers an "already initialized constant"
  # warning.
  VERSION = "0.3.0"
end

0 comments on commit d29b595

Please sign in to comment.