dl-kaltura up

2 years ago · 8656879ec2
4 changed files with 516 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,164 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Driver
+geckodriver.exe
+cookies.pkl
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
--- a/api/init.py
+++ b/api/init.py
--- a/api/auth.py
+++ b/api/auth.py
@ -0,0 +1,91 @@
+## 
+## api/auth.py
+## Provides functions for CalNet authentication over
+## Selenium, including saving and loading cookies.
+##
+## Copyright (c) 2022 Kevin Mo
+##
+
+import pickle
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+# File name of the pickled cookie list written by save_cookies()
+# and read back by load_cookies().
+COOKIES_NAME = "cookies.pkl"
+
+def save_cookies(driver):
+    """
+    Save cookies given a Selenium driver.
+
+    Pickles the value of ``driver.get_cookies()`` into the file named
+    by COOKIES_NAME, replacing any previous content.
+
+    Returns True once the cookies have been written.
+    """
+    # Fetch the cookies first so a driver failure cannot leave behind
+    # a freshly truncated, empty cookie file.
+    cookies = driver.get_cookies()
+
+    # The context manager flushes and closes the handle deterministically
+    # instead of leaving it to the garbage collector.
+    with open(COOKIES_NAME, "wb") as cookie_file:
+        pickle.dump(cookies, cookie_file)
+    return True
+
+def load_cookies(driver):
+    """
+    Load cookies given a Selenium driver.
+
+    Reads the pickled cookie list from the file named by COOKIES_NAME
+    and adds each entry to the driver with ``driver.add_cookie``. If the
+    file cannot be opened or read (IOError), no cookies are added.
+
+    Returns the same driver that was passed in.
+    """
+    try:
+        with open(COOKIES_NAME, "rb") as cookie_file:
+            # NOTE: unpickling can execute arbitrary code. Only load a
+            # cookie file that this tool wrote itself.
+            cookies = pickle.load(cookie_file)
+    except IOError:
+        # Best effort: without a readable cookie file, carry on unchanged.
+        return driver
+
+    for cookie in cookies:
+        driver.add_cookie(cookie)
+
+    return driver
+
+def check_calnet_auth(driver):
+    """
+    Tell whether the driver's current URL contains the CalNet
+    login host, "auth.berkeley.edu".
+    """
+    calnet_host = "auth.berkeley.edu"
+    current_location = driver.current_url
+    return calnet_host in current_location
+
+def perform_calnet_auth(driver, cid, pwd):
+    """
+    Perform authentication with CalNet, given
+    that driver is at auth.berkeley.edu.
+
+    Fills in the "username" and "password" fields, clicks "submit" and,
+    if the browser then lands on a duosecurity.com page, waits for the
+    user to finish Duo 2FA in the automated browser.
+
+    Args:
+        driver: Selenium WebDriver positioned on the CalNet login page.
+        cid: CalNet ID typed into the username field.
+        pwd: CalNet password typed into the password field.
+
+    Returns:
+        True when login finished (with or without 2FA); False when the
+        2FA step timed out or raised a WebDriver error.
+
+    Raises:
+        Exception: if the driver is not on auth.berkeley.edu when called,
+            or is still there right after the form is submitted.
+    """
+    # Local import keeps this function self-contained.
+    from selenium.common.exceptions import WebDriverException
+
+    print("Performing Calnet authentication")
+    print("Current URL:", driver.current_url)
+
+    if not check_calnet_auth(driver):
+        raise Exception("Cannot perform authentication at current state.")
+
+    # Target user input and then submit.
+    # find_element(By.ID, ...) replaces find_element_by_id, which was
+    # removed in Selenium 4.3.
+    cid_box = driver.find_element(By.ID, "username")
+    pwd_box = driver.find_element(By.ID, "password")
+    submit_box = driver.find_element(By.ID, "submit")
+
+    cid_box.send_keys(cid)
+    pwd_box.send_keys(pwd)
+    submit_box.click()
+
+    # Handle incorrect login attempt: still being on the login host
+    # after submitting is treated as rejected credentials.
+    if "auth.berkeley.edu" in driver.current_url:
+        print("Incorrect login attempt detected.")
+        raise Exception("Incorrect CalNet credentials")
+
+    # Check the presence of Duo 2FA; without it there is nothing to wait for.
+    if "duosecurity.com" not in driver.current_url:
+        return True
+
+    print()
+    print("IMPORTANT: Complete the 2FA process on the automated browser.")
+    print("Complete the process through push notification, security key, etc.")
+    print()
+
+    try:
+        # Stall for success or trust box
+        wait = WebDriverWait(driver, 30)
+        trust_box = wait.until(EC.element_to_be_clickable((By.ID, "trust-browser-button")))
+        print("Detected trust browser button, clicking...")
+        trust_box.click()
+
+        # Wait until the browser has LEFT Duo. Waiting for the URL to
+        # contain "duosecurity.com" would succeed immediately, because
+        # the browser is already there.
+        wait = WebDriverWait(driver, 10)
+        wait.until_not(EC.url_contains("duosecurity.com"))
+    except WebDriverException:
+        # TimeoutException is a WebDriverException. Keep the original
+        # best-effort behaviour, but let KeyboardInterrupt and
+        # programming errors propagate.
+        print("Timed out or could not locate Duo 2FA prompt.")
+        return False
+
+    print("Detected redirect, authentication is a success!")
+    return True
+    
--- a/dl-kaltura.py
+++ b/dl-kaltura.py
@ -0,0 +1,261 @@
+## 
+## dl-kaltura.py
+## Downloads videos from Kaltura (external tool #78985) with
+## Selenium WebDriver informed by the bCourses API.
+##
+## Starts a Selenium session, authenticates as a CalNet user,
+## asks for a course to download from, then downloads all
+## found videos to a target location.
+##
+## Copyright (c) 2022 Kevin Mo
+##
+
+## Libraries
+import time
+import re
+import os
+import requests
+# from selenium import webdriver
+from seleniumwire import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from getpass import getpass
+import json
+from api import auth
+from tqdm import tqdm, trange
+from tqdm.contrib.concurrent import thread_map
+
+## STEP 0: Common variables
+# Root folder that receives one sub-folder per downloaded course.
+DATA_LOCATION = "D:/Class/Lectures"
+
+## STEP 1: Start a headless Firefox session
+# (no saved cookies are loaded here; login happens in STEP 2 if required)
+options = Options()
+options.add_argument("--headless")
+driver = webdriver.Firefox(options=options)
+# Element lookups keep retrying for up to 10 seconds before failing.
+driver.implicitly_wait(10)
+
+## STEP 2: Direct to bcourses, then check auth.berkeley.edu
+BCOURSES_URL = "https://bcourses.berkeley.edu"
+driver.get(BCOURSES_URL)
+if auth.check_calnet_auth(driver):
+    # Prompt user for username
+    uid = input("CalNet ID: ")
+    pwd = getpass("CalNet password: ")
+    print("Performing authentication using credentials...")
+    auth.perform_calnet_auth(driver, uid, pwd)
+
+# WebDriverWait.until() raises TimeoutException instead of returning a
+# falsy value, so the timeout is translated into the intended error.
+from selenium.common.exceptions import TimeoutException
+
+wait = WebDriverWait(driver, 10)
+try:
+    wait.until(EC.url_contains(BCOURSES_URL))
+except TimeoutException as err:
+    raise Exception("Could not navigate to bCourses url") from err
+
+## STEP 3: Navigate to bCourses API and parse JSON
+BCOURSES_API = "https://bcourses.berkeley.edu/api/v1/courses"
+page_num = 1
+selected_class = None
+
+# Show one page of courses at a time until the user picks a valid entry.
+while True:
+    driver.get(BCOURSES_API + "?page=" + str(page_num))
+
+    # The response body is read from the page element with id "json".
+    json_content = driver.find_element(By.ID, "json").text
+    all_classes = json.loads(json_content)
+
+    if not isinstance(all_classes, list):
+        # Anything but a list cannot be shown as a menu, so stop with a
+        # clear error instead of crashing further down.
+        print("Received unexpected JSON response from bCourses API.")
+        raise Exception("Unexpected JSON response from bCourses API: expected a list of courses.")
+
+    print()
+    print("Select a class to download from:")
+    if not all_classes:
+        print("(No classes on this page.)")
+    for i, bclass in enumerate(all_classes):
+        # Entries without a name still get a printable label; the stored
+        # default is reused when the selection is announced below.
+        bclass.setdefault("name", "Unknown class")
+        print(str(i+1) + ") " + bclass["name"] + " (class ID #" + str(bclass["id"]) + ")")
+
+    class_option = input("Select an numbered option ('b' for last page, 'n' for next page): ")
+
+    if class_option == 'b':
+        # Go back one page, never below the first.
+        page_num = max(1, page_num-1)
+    elif class_option == 'n':
+        page_num = page_num + 1
+    else:
+        try:
+            class_idx = int(class_option)
+        except ValueError:
+            class_idx = 0  # not a number: falls through to the range check
+
+        # Explicit range check: 0 or a negative number would otherwise
+        # silently index from the end of the list.
+        if 1 <= class_idx <= len(all_classes):
+            selected_class = all_classes[class_idx-1]
+            break
+        print("Invalid input or numbered option!")
+
+if selected_class is None:
+    raise Exception("No class ID found.")
+
+print("\nSelected " + selected_class["name"])
+
+## STEP 3.5: Initialize directory for video transfer
+# Only word characters, whitespace and parentheses of the class name are
+# kept for the folder name.
+final_dir = DATA_LOCATION + "/" + re.sub(r'[^\w\s\(\)]', '', selected_class["name"])
+# exist_ok=False: an already existing course folder raises FileExistsError.
+os.makedirs(final_dir, exist_ok=False)
+
+## STEP 4: Navigate to Kaltura and pull all videos
+driver.get(BCOURSES_URL + "/courses/" + str(selected_class["id"]) + "/external_tools/78985")
+# The gallery is looked up inside the "tool_content" frame.
+kaltura_frame = driver.find_element(By.ID, "tool_content")
+driver.switch_to.frame(kaltura_frame)
+
+from selenium.common.exceptions import WebDriverException
+
+# Fetch more gallery items by clicking on
+# the more button
+print("Fetching all videos from Kaltura library...")
+try:
+    while True:
+        expand_more = driver.find_element(By.CSS_SELECTOR, ".endless-scroll-more > .btn")
+        expand_more.click()
+        time.sleep(3)
+except WebDriverException:
+    # The loop ends when the button can no longer be found or clicked.
+    # Only WebDriver errors are caught, so Ctrl-C still aborts the script.
+    print("Expanded all videos from the listing as possible.")
+
+gallery_elems = driver.find_elements(By.CLASS_NAME, "galleryItem")
+num_elems = len(gallery_elems)
+
+class GalleryItem:
+    """
+    Metadata for one video, read from a gallery element.
+
+    The constructor reads title, author, date, thumbnail and video link
+    from the given element; download/subtitle URLs, the download path and
+    the processed/downloaded flags start out empty and are filled in later.
+    """
+
+    def __init__(self, elem, index=-1):
+        def lookup(by, selector):
+            # Single place for child-element lookups on this gallery element.
+            return elem.find_element(by, selector)
+
+        # Attribute order is kept stable because __dict__ is serialised as-is.
+        self.index = index
+        self.title = lookup(By.CLASS_NAME, "thumb_name_content").text
+        self.author = lookup(By.CLASS_NAME, "userLink").text
+        self.date_added = lookup(By.CSS_SELECTOR, ".thumbTimeAdded > span > span").text
+        self.thumbnail = lookup(By.CLASS_NAME, "thumb_img").get_attribute("src")
+        self.video_url = lookup(By.CLASS_NAME, "item_link").get_attribute("href")
+        self.download_urls = {}
+        self.srt_urls = {}
+        self.download_path = None
+        self.processed = False
+        self.downloaded = False
+
+    def __str__(self):
+        # Renders as "(#<index>) <title> - <author>".
+        pieces = ("(#", self.str_index(), ") ", self.title, " - ", self.author)
+        return "".join(pieces)
+
+    def get_folder_name(self):
+        # Display label with everything except word characters, whitespace
+        # and parentheses removed.
+        label = str(self)
+        return re.sub(r'[^\w\s\(\)]', '', label)
+
+    def str_index(self):
+        # Zero-padded to three digits; a negative index means "unknown".
+        return "UNK" if self.index < 0 else format(self.index, "03")
+
+# Number the items so that the first listed element gets the highest index
+# and the last listed element gets 1.
+gallery_items = [GalleryItem(g, index=num_elems-i) for i, g in enumerate(gallery_elems)]
+
+print("This tool will download", len(gallery_items), "videos from the Kaltura gallery.\n")
+driver.switch_to.parent_frame()
+
+# Regex patterns, matched against the path of each intercepted request.
+# Literal dots (".ts", "index.php", ".srt") are escaped so that they no
+# longer match arbitrary characters.
+# re_vid groups: 1 = "scf/hls", 2 = p, 3 = sp, 4 = entryId, 5 = flavorId,
+#                6 = name, 7 = segment number.
+re_vid = re.compile(r"\/(scf\/hls)\/p\/(\d+)\/sp\/(\d+)\/serveFlavor\/entryId\/(\w+)\/v\/\d+\/ev\/\d+\/flavorId\/(\w+)\/name\/([\w\.]+)\/seg-(\d+)-[\w\-]+\.ts")
+# re_str groups: 1 = captionAssetId, 2 = ks.
+re_str = re.compile(r"\/api_v3\/index\.php\/service\/caption_captionAsset\/action\/serve\/captionAssetId\/(\w+)\/ks\/([\w\-]+)\/\.srt")
+
+print("Now processing detailed metadata and download links for all videos.")
+print("This process will take a while to complete.")
+
+def process_gallery_item(gallery_item, capture_seconds=3):
+    """
+    Gather full information for each video and
+    find video + srt links from browser requests.
+
+    Opens the item's video page, refreshes author and date from it,
+    starts playback and opens the quality selector, then sleeps while a
+    request interceptor records matching media and caption URLs into
+    ``gallery_item.download_urls`` and ``gallery_item.srt_urls``.
+
+    Args:
+        gallery_item: GalleryItem to fill in; mutated in place.
+        capture_seconds: How long to sleep while requests are recorded
+            (defaults to the previously hard-coded 3 seconds).
+    """
+    # Read in requests
+    def read_requests(request):
+        # Only requests to this host are inspected.
+        if request.host != "cfvod.kaltura.com":
+            return
+
+        vid_match = re_vid.match(request.path)
+        if vid_match:
+            # Keyed by entryId (group 4), so repeated segment requests for
+            # the same entry overwrite one another.
+            # NOTE(review): "/pd/" is presumably the direct-download
+            # variant of the "/scf/hls/" URL - confirm.
+            gallery_item.download_urls[vid_match.group(4) + '.mp4'] = request.url.replace("/scf/hls/", "/pd/")
+            return
+
+        srt_match = re_str.match(request.path)
+        if srt_match:
+            # Keyed by captionAssetId (group 1).
+            gallery_item.srt_urls[srt_match.group(1) + '.srt'] = request.url
+
+    # Reset all requests to proxy
+    del driver.requests
+    driver.request_interceptor = read_requests
+    try:
+        driver.get(gallery_item.video_url)
+
+        gallery_item.author = driver.find_element(By.CLASS_NAME, "userLink").text
+        gallery_item.date_added = driver.find_element(By.CSS_SELECTOR, "#js-entry-create-at > span").text
+
+        play_frame = driver.find_element(By.ID, "kplayer_ifp")
+        driver.switch_to.frame(play_frame)
+
+        play_button = driver.find_element(By.CLASS_NAME, "largePlayBtn")
+        play_button.click()
+
+        # Open quality options
+        settings_button = driver.find_element(By.CSS_SELECTOR, ".sourceSelector > button")
+        settings_button.click()
+
+        # Give the interceptor time to record the requests.
+        time.sleep(capture_seconds)
+    finally:
+        # Always detach the interceptor, even when scraping fails, so it
+        # cannot keep writing into this item.
+        del driver.request_interceptor
+
+    gallery_item.processed = True
+
+def print_gallery_item(gallery_item):
+    """
+    Print a gallery item for debugging: its string form first, then
+    date added, thumbnail, video URL, download URLs and subtitle URLs,
+    one value per line.
+    """
+    print(str(gallery_item))
+    detail_fields = ("date_added", "thumbnail", "video_url", "download_urls", "srt_urls")
+    for field_name in detail_fields:
+        print(getattr(gallery_item, field_name))
+
+try:
+    # Visit every video page so its media and caption requests get recorded.
+    with tqdm(gallery_items, position=0) as pbar:
+        for gallery_item in pbar:
+            pbar.set_description("Processing '" + gallery_item.title + "'")
+            process_gallery_item(gallery_item)
+
+    print()
+    print("All links have finished pre-processing.")
+
+    print()
+    print("Creating the required folders for each lecture and")
+    print("saving pre-processed metadata...")
+    with tqdm(gallery_items, position=0) as pbar:
+        for gallery_item in pbar:
+            pbar.set_description("Allocating '" + gallery_item.title + "'")
+            # Create the subfolder
+            dl_path = final_dir + "/" + gallery_item.get_folder_name()
+            os.makedirs(dl_path)
+            gallery_item.download_path = dl_path
+
+            # Store every attribute of the item next to its future downloads.
+            with open(dl_path + "/download.json", 'w') as f:
+                json.dump(gallery_item.__dict__, f, indent=4)
+finally:
+    # Shut the driver down even when processing fails. quit() ends the whole
+    # session, whereas close() only closes the current window and would
+    # leave the driver running.
+    driver.quit()
+
+## STEP 5: Download all the lecture data in parallel
+# Worker count handed to the parallel download step.
+NUM_PARALLEL = 5
+
+print()
+for banner_line in (
+    "Downloading lecture data in parallel (# of streams: " + str(NUM_PARALLEL) + ").",
+    "This process will take very long depending on your internet speed,",
+    "but should take less time to finish towards the end.",
+):
+    print(banner_line)
+
+def download_lecture(gallery_item):
+    """
+    Download every video and subtitle file recorded for one lecture.
+
+    Files already present in ``gallery_item.download_path`` are skipped.
+    Each remaining URL is streamed into a temporary ".part" file that is
+    renamed to its final name only after the transfer completes, so an
+    interrupted download is retried on the next run instead of being
+    mistaken for a finished file.
+
+    Args:
+        gallery_item: item providing ``download_path``, ``download_urls``
+            and ``srt_urls`` (both mapping file name -> URL).
+
+    Returns:
+        The same item, with ``downloaded`` set to True only if every file
+        is present afterwards (skipped or fetched with HTTP 200).
+    """
+    dl_path = gallery_item.download_path
+    # Videos and subtitles share one "file name -> URL" table.
+    dl_list = {**gallery_item.download_urls, **gallery_item.srt_urls}
+    all_ok = True
+
+    for fname, url in dl_list.items():
+        target = dl_path + "/" + fname
+        if os.path.exists(target):
+            continue
+
+        partial = target + ".part"
+        # The context manager releases the streamed connection when done.
+        with requests.get(url, stream=True) as r:
+            if r.status_code != 200:
+                print("Encountered unexpected status code", r.status_code, "while downloading", fname)
+                all_ok = False
+                continue
+
+            with open(partial, "wb") as f:
+                # 1 MiB chunks instead of the 128-byte chunks produced by
+                # iterating the response object directly.
+                for chunk in r.iter_content(chunk_size=1024 * 1024):
+                    f.write(chunk)
+
+        os.replace(partial, target)
+
+    gallery_item.downloaded = all_ok
+    return gallery_item
+
+# Run download_lecture over every gallery item with NUM_PARALLEL workers.
+next_items = thread_map(download_lecture, gallery_items, max_workers=NUM_PARALLEL)
+
+# Summing the boolean flags counts the items marked as downloaded.
+num_downloaded = sum(item.downloaded for item in next_items)
+num_total = len(next_items)
+
+print()
+print("Status Report:")
+print("Total videos downloaded", num_downloaded, "out of", num_total)
+print("Tool has finished execution.")