From 8656879ec23b4a2bf721f8d841b4dbe3ca17c06d Mon Sep 17 00:00:00 2001 From: kmo-ev Date: Tue, 26 Apr 2022 07:56:49 -0700 Subject: [PATCH] dl-kaltura up --- .gitignore | 164 ++++++++++++++++++++++++++++++ api/__init__.py | 0 api/auth.py | 91 +++++++++++++++++ dl-kaltura.py | 261 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 516 insertions(+) create mode 100644 .gitignore create mode 100644 api/__init__.py create mode 100644 api/auth.py create mode 100644 dl-kaltura.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7ff9c93 --- /dev/null +++ b/.gitignore @@ -0,0 +1,164 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Driver +geckodriver.exe +cookies.pkl + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/auth.py b/api/auth.py new file mode 100644 index 0000000..935b157 --- /dev/null +++ b/api/auth.py @@ -0,0 +1,91 @@ +## +## lib/auth.py +## Provides functions for Calnet authentication over +## Selenium, including cookies export. +## +## Copyright (c) 2022 Kevin Mo +## + +import pickle +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +COOKIES_NAME = "cookies.pkl" + +def save_cookies(driver): + """ + Save cookies given a Selenium driver. + """ + pickle.dump(driver.get_cookies(), open(COOKIES_NAME, "wb")) + return True + +def load_cookies(driver): + """ + Load cookies given a Selenium driver. + """ + try: + cookies = pickle.load(open(COOKIES_NAME, "rb")) + for cookie in cookies: + driver.add_cookie(cookie) + except IOError: + pass + + return driver + +def check_calnet_auth(driver): + """ + Returns whether CalNet authentication is ready. + """ + return "auth.berkeley.edu" in driver.current_url + +def perform_calnet_auth(driver, cid, pwd): + """ + Perform authentication with CalNet, given + that driver is at auth.berkeley.edu. 
+    """
+    print("Performing CalNet authentication")
+    print("Current URL:", driver.current_url)
+
+    if not check_calnet_auth(driver):
+        raise Exception("Cannot perform authentication at current state.")
+
+    # Locate the username/password fields and the submit button
+    cid_box = driver.find_element(By.ID, "username")
+    pwd_box = driver.find_element(By.ID, "password")
+    submit_box = driver.find_element(By.ID, "submit")
+
+    cid_box.send_keys(cid)
+    pwd_box.send_keys(pwd)
+    submit_box.click()
+
+    # Handle incorrect login attempt
+    if "auth.berkeley.edu" in driver.current_url:
+        print("Incorrect login attempt detected.")
+        raise Exception("Incorrect CalNet credentials")
+
+    # Check for the presence of Duo 2FA
+    try:
+        if "duosecurity.com" not in driver.current_url:
+            return True
+
+        print()
+        print("IMPORTANT: Complete the 2FA process in the automated browser.")
+        print("Complete the process through push notification, security key, etc.")
+        print()
+
+        # Wait for the trust-browser prompt
+        wait = WebDriverWait(driver, 30)
+        trust_box = wait.until(EC.element_to_be_clickable((By.ID, "trust-browser-button")))
+        print("Detected trust browser button, clicking...")
+        trust_box.click()
+
+        wait = WebDriverWait(driver, 10)
+        if not wait.until(lambda d: "duosecurity.com" not in d.current_url):
+            raise Exception("2FA did not complete successfully")
+        else:
+            print("Detected redirect, authentication is a success!")
+    except Exception:
+        print("Timed out or could not locate Duo 2FA prompt.")
+
+    
\ No newline at end of file
diff --git a/dl-kaltura.py b/dl-kaltura.py
new file mode 100644
index 0000000..b9561d4
--- /dev/null
+++ b/dl-kaltura.py
@@ -0,0 +1,261 @@
+##
+## dl-kaltura.py
+## Downloads videos from Kaltura (external tool #78985) with
+## Selenium WebDriver, informed by the bCourses API.
+##
+## Starts a Selenium session, authenticates as a CalNet user,
+## asks for a course to download from, then downloads all
+## found videos to a target location.
+##
+## Copyright (c) 2022 Kevin Mo
+##
+
+## Libraries
+import time
+import re
+import os
+import requests
+# from selenium import webdriver
+from seleniumwire import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from getpass import getpass
+import json
+from api import auth
+from tqdm import tqdm, trange
+from tqdm.contrib.concurrent import thread_map
+
+## STEP 0: Common variables
+DATA_LOCATION = "D:/Class/Lectures"
+
+## STEP 1: Start session and load cookies
+options = Options()
+options.add_argument("--headless")
+driver = webdriver.Firefox(options=options)
+driver.implicitly_wait(10)
+
+## STEP 2: Direct to bCourses, then check auth.berkeley.edu
+BCOURSES_URL = "https://bcourses.berkeley.edu"
+driver.get(BCOURSES_URL)
+if auth.check_calnet_auth(driver):
+    # Prompt user for CalNet credentials
+    uid = input("CalNet ID: ")
+    pwd = getpass("CalNet password: ")
+    print("Performing authentication using credentials...")
+    auth.perform_calnet_auth(driver, uid, pwd)
+
+wait = WebDriverWait(driver, 10)
+if not wait.until(EC.url_contains(BCOURSES_URL)):
+    raise Exception("Could not navigate to bCourses URL")
+
+## STEP 3: Navigate to bCourses API and parse JSON
+BCOURSES_API = "https://bcourses.berkeley.edu/api/v1/courses"
+page_num = 1
+selected_class = None
+
+while True:
+    driver.get(BCOURSES_API + "?page=" + str(page_num))
+
+    json_content = driver.find_element(By.ID, "json").text
+    all_classes = json.loads(json_content)
+
+    if not isinstance(all_classes, list):
+        raise Exception("Received unexpected JSON response from bCourses API.")
+
+    print()
+    print("Select a class to download from:")
+    for i, bclass in enumerate(all_classes):
+        if "name" not in bclass:
+            bclass["name"] = "Unknown class"
+        print(str(i+1) + ") " + bclass["name"] + " (class ID #" + str(bclass["id"]) + ")")
+
+    class_option = input("Select a numbered option ('b' for previous page, 'n' for next page): ")
+
+    if class_option == 'b':
+        page_num = max(1, page_num-1)
+    elif class_option == 'n':
+        page_num = page_num + 1
+    else:
+        try:
+            class_idx = int(class_option)
+            selected_class = all_classes[class_idx-1]
+            break
+        except (ValueError, IndexError):
+            print("Invalid or out-of-range option!")
+
+if selected_class is None:
+    raise Exception("No class ID found.")
+
+print("\nSelected " + selected_class["name"])
+
+## STEP 3.5: Initialize directory for video transfer
+final_dir = DATA_LOCATION + "/" + re.sub(r'[^\w\s\(\)]', '', selected_class["name"])
+os.makedirs(final_dir, exist_ok=False)
+
+## STEP 4: Navigate to Kaltura and pull all videos
+driver.get(BCOURSES_URL + "/courses/" + str(selected_class["id"]) + "/external_tools/78985")
+kaltura_frame = driver.find_element(By.ID, "tool_content")
+driver.switch_to.frame(kaltura_frame)
+
+# Fetch more gallery items by clicking on
+# the "more" button until it disappears
+print("Fetching all videos from Kaltura library...")
+try:
+    while True:
+        expand_more = driver.find_element(By.CSS_SELECTOR, ".endless-scroll-more > .btn")
+        expand_more.click()
+        time.sleep(3)
+except Exception:
+    print("Expanded the video listing as far as possible.")
+
+gallery_elems = driver.find_elements(By.CLASS_NAME, "galleryItem")
+num_elems = len(gallery_elems)
+
+class GalleryItem:
+    def __init__(self, elem, index=-1):
+        self.index = index
+        self.title = elem.find_element(By.CLASS_NAME, "thumb_name_content").text
+        self.author = elem.find_element(By.CLASS_NAME, 
"userLink").text + self.date_added = elem.find_element(By.CSS_SELECTOR, ".thumbTimeAdded > span > span").text + self.thumbnail = elem.find_element(By.CLASS_NAME, "thumb_img").get_attribute("src") + self.video_url = elem.find_element(By.CLASS_NAME, "item_link").get_attribute("href") + self.download_urls = {} + self.srt_urls = {} + self.download_path = None + self.processed = False + self.downloaded = False + + def __str__(self): + return "(#" + self.str_index() + ") " + self.title + " - " + self.author + + def get_folder_name(self): + return re.sub(r'[^\w\s\(\)]', '', str(self)) + + def str_index(self): + if self.index < 0: + return "UNK" + return f'{self.index:03}' + +gallery_items = [GalleryItem(g, index=num_elems-i) for i, g in enumerate(gallery_elems)] + +print("This tool will download", len(gallery_items), "videos from the Kaltura gallery.\n") +driver.switch_to.parent_frame() + +# Regex patterns +re_vid = re.compile(r"\/(scf\/hls)\/p\/(\d+)\/sp\/(\d+)\/serveFlavor\/entryId\/(\w+)\/v\/\d+\/ev\/\d+\/flavorId\/(\w+)\/name\/([\w\.]+)\/seg-(\d+)-[\w\-]+.ts") +re_str = re.compile(r"\/api_v3\/index.php\/service\/caption_captionAsset\/action\/serve\/captionAssetId\/(\w+)\/ks\/([\w\-]+)\/.srt") + +print("Now processing detailed metadata and download links for all videos.") +print("This process will take a while to complete.") + +def process_gallery_item(gallery_item): + """ + Gather full information for each video and + find video + srt links from browser requests. + """ + # Read in requests + def read_requests(request): + if request.host == "cfvod.kaltura.com": + vid_match = re_vid.match(request.path) + srt_match = re_str.match(request.path) + + if vid_match: + gallery_item.download_urls[vid_match.group(4) + '.mp4'] = request.url.replace("/scf/hls/", "/pd/") + elif srt_match: + gallery_item.srt_urls[srt_match.group(1) + '.srt'] = request.url + + # Reset all requests to proxy + del driver.requests + driver.request_interceptor = read_requests + driver.get(gallery_item.video_url) + + gallery_item.author = driver.find_element(By.CLASS_NAME, "userLink").text + gallery_item.date_added = driver.find_element(By.CSS_SELECTOR, "#js-entry-create-at > span").text + + play_frame = driver.find_element(By.ID, "kplayer_ifp") + driver.switch_to.frame(play_frame) + + play_button = driver.find_element(By.CLASS_NAME, "largePlayBtn") + play_button.click() + + # Open quality options + settings_button = driver.find_element(By.CSS_SELECTOR, ".sourceSelector > button") + settings_button.click() + + """ + pbar = trange(50, position=1) + for _ in pbar: + time.sleep(0.1) + pbar.set_description("Reading network") + """ + time.sleep(3) + + gallery_item.processed = True + del driver.request_interceptor + +def print_gallery_item(gallery_item): + print(str(gallery_item)) + print(gallery_item.date_added) + print(gallery_item.thumbnail) + print(gallery_item.video_url) + print(gallery_item.download_urls) + print(gallery_item.srt_urls) + +with tqdm(gallery_items, position=0) as pbar: + for gallery_item in pbar: + pbar.set_description("Processing '" + gallery_item.title + "'") + process_gallery_item(gallery_item) + +print() +print("All links have finished pre-processing.") + +print() +print("Creating the required folders for each lecture and") +print("saving pre-processed metadata...") +with tqdm(gallery_items, position=0) as pbar: + for gallery_item in pbar: + pbar.set_description("Allocating '" + gallery_item.title + "'") + # Create the subfolder + dl_path = final_dir + "/" + gallery_item.get_folder_name() + os.makedirs(dl_path) 
+        gallery_item.download_path = dl_path
+
+        with open(dl_path + "/download.json", 'w') as f:
+            f.write(json.dumps(gallery_item.__dict__, indent=4))
+
+# End the Selenium session; the browser is no longer needed
+driver.quit()
+
+## STEP 5: Download all the lecture data in parallel
+NUM_PARALLEL = 5
+
+print()
+print("Downloading lecture data in parallel (# of streams: " + str(NUM_PARALLEL) + ").")
+print("This process may take a long time depending on your internet speed,")
+print("but it should speed up towards the end.")
+
+def download_lecture(gallery_item):
+    dl_path = gallery_item.download_path
+    # Download all video files and subtitles
+    dl_list = {**gallery_item.download_urls, **gallery_item.srt_urls}
+    for fname in dl_list:
+        if not os.path.exists(dl_path + "/" + fname):
+            r = requests.get(dl_list[fname], stream=True)
+            if r.status_code == 200:
+                with open(dl_path + "/" + fname, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=1024 * 1024):
+                        f.write(chunk)
+            else:
+                print("Encountered unexpected status code", r.status_code, "while downloading", fname)
+
+    gallery_item.downloaded = True
+    return gallery_item
+
+next_items = thread_map(download_lecture, gallery_items, max_workers=NUM_PARALLEL)
+
+print()
+print("Status Report:")
+print("Total videos downloaded:", sum([i.downloaded for i in next_items]), "out of", len(next_items))
+print("Tool has finished execution.")
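
Note: the per-lecture download.json files written in the allocation step contain the captured download_urls and srt_urls, so downloads can be retried later without re-scraping Kaltura. The snippet below is a minimal sketch of that retry pattern, not part of the patch; LECTURE_DIR is an assumed path, and the captured Kaltura links may be session-bound, so they are only usable while they remain valid.

## retry-downloads.py (hypothetical helper, not included in this patch)
## Re-runs the STEP 5 download loop from the saved download.json metadata.
import glob
import json
import os

import requests
from tqdm.contrib.concurrent import thread_map

# Assumed location of one class folder created by dl-kaltura.py
LECTURE_DIR = "D:/Class/Lectures/SomeClass"

def redownload(meta_path):
    """Stream every missing file listed in one lecture's download.json."""
    with open(meta_path) as f:
        meta = json.load(f)
    folder = os.path.dirname(meta_path)
    # Same merge of video and subtitle URLs as download_lecture()
    targets = {**meta.get("download_urls", {}), **meta.get("srt_urls", {})}
    for fname, url in targets.items():
        out_path = os.path.join(folder, fname)
        if os.path.exists(out_path):
            continue  # skip files that already finished
        # Captured Kaltura URLs may have expired; a non-200 status is expected then
        r = requests.get(url, stream=True)
        if r.status_code == 200:
            with open(out_path, "wb") as out:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    out.write(chunk)
        else:
            print("Unexpected status", r.status_code, "for", fname)
    return meta_path

if __name__ == "__main__":
    metas = glob.glob(os.path.join(LECTURE_DIR, "*", "download.json"))
    thread_map(redownload, metas, max_workers=5)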