# Author: EDLyonhart
# Based on these tutorials:
#   https://www.youtube.com/watch?v=mMHflTR-MuY
#   https://www.youtube.com/watch?v=W_rEl19WIDg
#   https://www.youtube.com/watch?v=WOkysoDl6SA
# Date: 10 October 2016

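# Mechanize is an external gem; if it is not already available, installing it
# with `gem install mechanize` is the usual route (assumed setup step, not part
# of the script itself).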
require 'mechanize'

class WebCrawler
  # remember the path of the URL file ('./urls.txt' below); the same file
  # supplies the seed URLs and receives every new link that is found
  def initialize(file_name)
    @file = file_name
  end

  private

  # use 'check' method to only add unique URLs to file
  def save_site_crawl(site_url)
    begin
      if check(site_url)
        File.open(@file, "a") do |data|
          data.puts site_url
        end
      end
    rescue StandardError => error_message
      puts "ERROR: #{error_message}"
    end
  end

  # return true only if 'url' is not already present in @file
  def check(url)
    data = File.read(@file)
    urls = data.split
    if urls.include? url
      return false
    else
      return true
    end
  end

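  # return every URL currently stored in @file as an array of strings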
  def fetch_database_urls
    active_urls = File.read(@file)
    urls = active_urls.split
    return urls
  end

  public

  def crawl
    links_found = 0
    agent = Mechanize.new
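    # NOTE: VERIFY_NONE turns off SSL certificate verification so pages with
    # broken or self-signed certificates can still be fetched; avoid this
    # outside of a local experiment.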
    agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    # get each url stored in the file
    fetched_urls = fetch_database_urls()
    # crawl every page listed in the imported file
    fetched_urls.each do |url_to_crawl|
      begin
        page = agent.get(url_to_crawl)
        links = page.links
        links.each do |link|
          # read the 'href' value from the returned link node
          scraped_url = link.attributes['href']
          # skip links without an href and in-page anchors (href="#")
          next if scraped_url.nil? || scraped_url == "#"
          # branch on how the href begins
          case scraped_url[0..4]
          when "https", "http:", "ftp:/"
            save_site_crawl(scraped_url)
            puts "Checking: #{scraped_url}\n---------------------------------------------\n"
          else
            url_split = url_to_crawl.split("/")
            p "url_split = #{url_split}"
            # if scraped_url is a relative link (eg: '/home'), rebuild an absolute URL
            if scraped_url[0] == "/"
              # example: url_split = ["https:", "", "services.bostonglobe.com", "pwd", "reset.asp"]
              final_url = url_split[0] + "//" + url_split[2] + scraped_url
            else
              final_url = url_split[0] + "//" + url_split[2] + "/" + scraped_url
            end
            save_site_crawl(final_url)
            p "Checked: #{final_url}\n - - - - - - - - - - - - - - - - - \n"
          end
          links_found += 1
        end
      rescue StandardError => get_error
        puts "Request Level Error: #{get_error}"
      end
    end
    puts "Status Update: #{links_found} links found."
  end
end

crawler = WebCrawler.new('./urls.txt')
crawler.crawl
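
# Usage sketch (assumed, based on the code above): create './urls.txt' with at
# least one absolute URL per line, e.g.
#   https://example.com
# then run `ruby crawler.rb`; newly discovered links are appended to the same file.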