crawl.py
from colorama import Fore, Style, init

from src.crawler.crawler import crawler

# Initialize colorama so colors reset automatically after each print
init(autoreset=True)

# Create the crawler with the bundled ChromeDriver binary
crawl = crawler("src\\vendor\\chromedriver-win64\\chromedriver.exe")


def main(url):
    print(f"{Fore.YELLOW}{Style.BRIGHT}Crawling the web..")
    links = [url]
    try:
        # Load the page, wait for it to render, then collect the links found on it
        crawl.fetch(url, wait_time=5)
        crawled_links = list(set(crawl.links))
        links.extend(crawled_links)
        links = list(set(links))
    except Exception as e:
        print(f"{Fore.RED}{Style.BRIGHT}\nStopping..")
        print(e)


# Read the seed URLs (one per line) and crawl each one until interrupted
with open("data\\top-1m.txt", "r", encoding="utf-8") as f:
    links = f.readlines()

for link in links:
    try:
        main(link.strip())
    except KeyboardInterrupt:
        break

print(f"{Fore.YELLOW}{Style.BRIGHT}Closing the crawler..")
crawl.close()
print(f"{Fore.YELLOW}{Style.BRIGHT}Saving the crawled data..")
crawl.save("data\\index.json")
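The crawler class imported from src/crawler/crawler.py is not shown here. Based only on the calls crawl.py makes (fetch(url, wait_time=...), the links attribute, close(), and save(path)) and the ChromeDriver path it is given, a minimal Selenium-based sketch of that interface might look like the following. The class body, the use of the page title, and the index format are assumptions for illustration, not the repository's actual implementation.

import json
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


class crawler:
    """Hypothetical sketch of the interface crawl.py relies on."""

    def __init__(self, driver_path):
        # Start Chrome through the driver binary passed in by crawl.py
        self.driver = webdriver.Chrome(service=Service(driver_path))
        self.links = []   # read by crawl.py after each fetch()
        self.index = {}   # written out by save()

    def fetch(self, url, wait_time=5):
        # Accept bare domains such as those in top-1m.txt
        if not url.startswith("http"):
            url = "https://" + url
        self.driver.get(url)
        time.sleep(wait_time)  # crude wait for the page to render
        # Collect every anchor href found on the page
        for anchor in self.driver.find_elements(By.TAG_NAME, "a"):
            href = anchor.get_attribute("href")
            if href:
                self.links.append(href)
        # Record something about the page; the real index layout is unknown
        self.index[url] = self.driver.title

    def close(self):
        self.driver.quit()

    def save(self, path):
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.index, f, indent=2, ensure_ascii=False)

Under these assumptions, crawl.py's loop over data\top-1m.txt would fill self.index with one entry per seed URL and dump the result to data\index.json when the run ends or is interrupted.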