From 5e0881c66c3b2744e98354845f98242ea03a27eb Mon Sep 17 00:00:00 2001 From: Sanketh Mopuru Date: Mon, 18 May 2015 14:06:09 +0530 Subject: [PATCH] More features added --- CHANGES.txt | 6 +++++- README.md | 1 + README.rst | 2 ++ coursera_offline.py | 35 ++++++++++++++++++++++++++++++----- setup.py | 4 ++-- 5 files changed, 40 insertions(+), 8 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 9d59cce..8a5483b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -14,4 +14,8 @@ Added support for windows --------------------VERSION 1.0.2---------------------- 1. Fixed a bug where -d option required the directory given as option to exist. Now the script creates the directory if it doesn't exist. -2. The directory path given for -d can now be relative and the script takes care of converting it to absolute path. \ No newline at end of file +2. The directory path given for -d can now be relative and the script takes care of converting it to absolute path. + +--------------------VERSION 1.1.0------------------------ +1. Script can now download lecture slides (pdfs and ppts). +2. Saves the data.json file prior to downloading. \ No newline at end of file diff --git a/README.md b/README.md index d0da541..64131d7 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,7 @@ Run `export HTTP_PROXY=http://user:password@address:port` and `export HTTPS_PROX * The script also downloads the subtitles automatically and saves them in *Subs* folder. So when you play the videos using VLC, the subs are automatically loaded. * The script saves a *data.json* file in the course directory. This has all the information required to fetch the videos. So even if you lose some videos or if you forget the course name, as long as you have the *data.json* file, you can always re-download the lost videos. * The script saves the session cookies so you don't have to login everytime you run it. +* The script fetches the lecture slides (pdfs and ppts) and saves them in 'Other Files' folder. 
## Full Usage diff --git a/README.rst b/README.rst index 6d61d30..223cd9e 100644 --- a/README.rst +++ b/README.rst @@ -110,6 +110,8 @@ Features all the information required to fetch the videos. So even if you lose some videos or if you forget the course name, as long as you have the *data.json* file, you can always re-download the lost videos. +- The script saves the session cookies so you don't have to log in every time you run it. +- The script fetches the lecture slides (pdfs and ppts) and saves them in 'Other Files' folder. Full Usage ---------- diff --git a/coursera_offline.py b/coursera_offline.py index 16a1cae..afb1052 100755 --- a/coursera_offline.py +++ b/coursera_offline.py @@ -35,6 +35,7 @@ DATA_FILE = 'data.json' COOKIE_FILE = 'cookie.cookies' COURSE_DIR = os.getcwd() +OTHER_DIR = 'Other Files' class Downloader(threading.Thread): """Instance of threading.Thread class. @@ -67,6 +68,21 @@ def run(self): print 'Download finished for %s' % absolute_path(self.savepath) +def get_vid_sub_links(anchor_elems): + vid_link = None + sub_link = None + other_links = [] + for anchor_elem in anchor_elems: + temp = pq(anchor_elem) + href = temp.attr('href'); + if href.find('subtitles') != -1 and href.find('format=srt') != -1: + sub_link = href + elif href.find('download.mp4') != -1: + vid_link = href + elif href.find('.pdf') != -1 or href.find('.pptx') != -1: + other_links.append(href) + return vid_link, sub_link, other_links + def exit_with_message(msg): # Print the msg and exit the script print msg @@ -242,6 +258,7 @@ def download(parsed_json, cookie): week_count += 1 create_folder(folder_name) create_folder(os.path.join(folder_name, SUB_DIR)) + create_folder(os.path.join(folder_name, OTHER_DIR)) count = 0 for vid_info in sub_json['links']: @@ -269,6 +286,15 @@ def download(parsed_json, cookie): d = Downloader(suburl, sub_path, cookie, True) threads.append(d) + for other_link in vid_info['other_links']: + other_title = other_link.split('/')[-1] + other_path = 
os.path.join(folder_name, OTHER_DIR, str(count) + '-' + other_title) + if path_exists(other_path): + print 'Skipping %s' % other_path + else: + p = Downloader(other_link, other_path, cookie) + threads.append(p) + for thread in threads: thread.start() @@ -336,8 +362,8 @@ def main(): if not args.file: parsed_json = get_course_info(shortname, cookie_logged_in) - download(parsed_json, cookie_logged_in) save_data_file(parsed_json) + download(parsed_json, cookie_logged_in) if args.auto is not None: schedule_synch(args.auto, args.email, args.password) @@ -389,10 +415,9 @@ def get_course_info(shortname, cookie): for list_item in list_items: list_elem = pq(list_item) anchor_elems = list_elem('a') - vid_title = pq(anchor_elems[0]).text() - vid_link = pq(anchor_elems[len(anchor_elems) - 1]).attr('href') - sub_link = pq(anchor_elems[len(anchor_elems) - 2]).attr('href') - parsed_json['links'].append({'title':vid_title, 'link':vid_link, 'sub_link':sub_link}) + vid_title = pq(anchor_elems[0]).text() + vid_link, sub_link, other_links = get_vid_sub_links(anchor_elems) + parsed_json['links'].append({'title':vid_title, 'link':vid_link, 'sub_link':sub_link, 'other_links': other_links}) course_info_json['data'].append(parsed_json) except Exception, e: exit_with_message('Invalid HTML file receieved') diff --git a/setup.py b/setup.py index 963b20b..3873932 100755 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name = "coursera_offline", - version = "1.0.2", + version = "1.1.0", author="Sanketh Mopuru", author_email="sanketh.mopuru@gmail.com", @@ -28,7 +28,7 @@ "docutils>=0.3" ], - keywords = "coursera offline download lecture lectures video videos", + keywords = "coursera offline download lecture lectures videos and slides", scripts=['coursera_offline.py'] ) \ No newline at end of file