"""Download the video lectures of a course from didattica.polito.it.

Prompts for the owa_session cookie and the course ID, then saves the
course's videos under ./out/<course ID>/.
"""
import logging
import os
import shutil

import requests
from bs4 import BeautifulSoup

# "settings"
OUTPUT_DIR = "./out/"
DEBUGGING = False

# Constants
URL_PLACEHOLDER = "?#?"
BASE_URL = "https://didattica.polito.it/portal/pls/portal/"
VC_BASE_URL = f"{BASE_URL}sviluppo.videolezioni.vis?cor={URL_PLACEHOLDER}"

def die(msg: str, code: int) -> None:
    print(msg)
    print("Press enter to exit")
    input()  # Keep the console window open until the user presses enter
    raise SystemExit(code)

def get_page_urls(html: str) -> list[str]:
    """Return the full URLs of all lecture pages linked in the course navbar."""
    urls = []
    soup = BeautifulSoup(html, features="html.parser")
    elems = soup.find("div", {"id": "navbar_left_menu"})  # Get navbar...
    elems = elems.find_all("li", {"class": "h5"})  # ... get all h5 tags ...
    logging.info(f"Found {len(elems)} links")
    for i in elems:
        partial_url = i.find("a")["href"]  # ... finally get the hrefs from the <a> children
        url = f"{BASE_URL}{partial_url}"  # Transform to full URL
        urls.append(url)
        logging.info(f"Added {url} to list")
    return urls

def get_video_urls(page_urls: list[str], s: requests.Session) -> list[str]:
    """Visit each lecture page and return the direct URL of its video."""
    urls = []
    for i in page_urls:
        logging.info(f"Visiting {i} to retrieve video URL")
        r = s.get(i)  # Visit page...
        soup = BeautifulSoup(r.text, features="html.parser")
        video_url = soup.find("video").find("source")["src"]  # ...and get <source> inside <video>
        urls.append(video_url)
        logging.info(f"Added {video_url} to list")
    return urls

def download_videos(video_urls: list[str], s: requests.Session, course: str) -> None:
    """Download every video to OUTPUT_DIR/<course>/, keeping the original filenames."""
    tot_count = len(video_urls)
    if not os.path.exists(OUTPUT_DIR):  # Check if main output directory exists and create it if not
        os.makedirs(OUTPUT_DIR)
    if not os.path.exists(f"{OUTPUT_DIR}{course}"):  # Check if subdirectory for course exists and create it if not
        os.makedirs(f"{OUTPUT_DIR}{course}")
    for count, value in enumerate(video_urls):
        print(f"Downloading video {count + 1}/{tot_count}")
        filename = value.split("/")[-1]  # Get filename from url
        logging.info(f"Downloading {value}")
        with s.get(value, stream=True) as r:
            with open(f"{OUTPUT_DIR}{course}/{filename}", "wb") as f:
                shutil.copyfileobj(r.raw, f)  # Stream the response straight to disk

def main():
    if DEBUGGING:
        logging.basicConfig(level=logging.DEBUG)
    owa_session = input("Enter the owa_session cookie value: ")
    course = input("Enter the course ID: ")
    cookies = {"owa_session": owa_session}
    s = requests.Session()
    r = s.get(VC_BASE_URL.replace(URL_PLACEHOLDER, course), cookies=cookies)
    # If anyone has a better way to check for incorrect values, let me know!
    if "Access denied!" in r.text:
        die("Access denied! Make sure the cookie value is correct!", 1)
    elif "no data found" in r.text:
        die("No data found! Make sure the course ID is correct!", 2)
    page_urls = get_page_urls(r.text)
    video_urls = get_video_urls(page_urls, s)
    download_videos(video_urls, s, course)
    die("Done!", 0)

if __name__ == "__main__":
    main()