dl-kaltura up

3 years ago · 8656879ec2
4 changed files with 516 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,164 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Driver
 geckodriver.exe
 cookies.pkl
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
 #   in version control.
 #   https://pdm.fming.dev/#use-with-ide
 .pdm.toml
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
--- a/api/init.py
+++ b/api/init.py
--- a/api/auth.py
+++ b/api/auth.py
@ -0,0 +1,91 @@
 ## 
 ## api/auth.py
 ## Provides functions for CalNet authentication over
 ## Selenium, including cookies export.
 ##
 ## Copyright (c) 2022 Kevin Mo
 ##
 import pickle
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 # File (relative to the working directory) that holds the pickled cookies.
 COOKIES_NAME = "cookies.pkl"
 def save_cookies(driver):
    """
    Save cookies given a Selenium driver.

    The list returned by driver.get_cookies() is pickled into
    COOKIES_NAME, overwriting any previous file. Always returns True.
    """
    # Fetch first, so a failing driver does not leave a truncated file behind.
    cookies = driver.get_cookies()
    # The context manager closes the file even if pickling fails.
    with open(COOKIES_NAME, "wb") as cookie_file:
        pickle.dump(cookies, cookie_file)
    return True
 def load_cookies(driver):
    """
    Load cookies given a Selenium driver.

    Reads the pickled cookie list from COOKIES_NAME and adds each
    cookie to the driver. A missing or unreadable file is ignored.
    Returns the same driver.
    """
    try:
        # The context manager closes the file once the list is read.
        with open(COOKIES_NAME, "rb") as cookie_file:
            # NOTE(review): pickle.load can execute arbitrary code from the
            # file; only load a cookie file this tool wrote itself.
            cookies = pickle.load(cookie_file)
        for cookie in cookies:
            driver.add_cookie(cookie)
    except IOError:
        # No saved cookies (or unreadable file): continue without them.
        pass
    return driver
 def check_calnet_auth(driver):
    """
    Tell whether the driver currently sits on the CalNet login host.

    Returns True when the driver's current URL contains
    "auth.berkeley.edu", otherwise False.
    """
    current_location = driver.current_url
    on_login_host = "auth.berkeley.edu" in current_location
    return on_login_host
 def perform_calnet_auth(driver, cid, pwd):
    """
    Perform authentication with CalNet, given
    that driver is at auth.berkeley.edu.

    Fills in and submits the login form. If the browser then lands on a
    Duo page (duosecurity.com), waits up to 30 seconds for the "trust
    browser" button, clicks it and waits up to 10 seconds for the
    redirect away from Duo.

    Returns True when the login (and 2FA, if it was requested) went
    through, and False when the Duo step timed out or failed.
    Raises Exception when the driver is not on the CalNet login page,
    or is still on it after the form was submitted.
    """
    # Local import: the module header only imports By / WebDriverWait / EC.
    from selenium.common.exceptions import WebDriverException
    print("Performing Calnet authentication")
    print("Current URL:", driver.current_url)
    if not check_calnet_auth(driver):
        raise Exception("Cannot perform authentication at current state.")
    # Target user input and then submit.
    # find_element(By.ID, ...) replaces find_element_by_id, which newer
    # Selenium releases no longer provide.
    cid_box = driver.find_element(By.ID, "username")
    pwd_box = driver.find_element(By.ID, "password")
    submit_box = driver.find_element(By.ID, "submit")
    cid_box.send_keys(cid)
    pwd_box.send_keys(pwd)
    submit_box.click()
    # Handle incorrect login attempt
    if "auth.berkeley.edu" in driver.current_url:
        print("Incorrect login attempt detected.")
        raise Exception("Incorrect CalNet credentials")
    # Check the presence of Duo 2FA
    try:
        if "duosecurity.com" not in driver.current_url:
            return True
        print()
        print("IMPORTANT: Complete the 2FA process on the automated browser.")
        print("Complete the process through push notification, security key, etc.")
        print()
        # Stall for success or trust box
        wait = WebDriverWait(driver, 30)
        trust_box = wait.until(EC.element_to_be_clickable((By.ID, "trust-browser-button")))
        print("Detected trust browser button, clicking...")
        trust_box.click()
        # Success means leaving Duo, so wait until the URL no longer
        # contains duosecurity.com; a timeout raises TimeoutException.
        wait = WebDriverWait(driver, 10)
        wait.until_not(EC.url_contains("duosecurity.com"))
    except WebDriverException:
        # Best effort: report the problem and let the caller's own URL
        # check decide whether to go on. Ctrl+C is no longer swallowed.
        print("Timed out or could not locate Duo 2FA prompt.")
        return False
    print("Detected redirect, authentication is a success!")
    return True
--- a/dl-kaltura.py
+++ b/dl-kaltura.py
@ -0,0 +1,261 @@
 ## 
 ## dl-kaltura.py
 ## Downloads videos from Kaltura (external tool #78985) with
 ## Selenium WebDriver informed by the bCourses API.
 ##
 ## Starts a Selenium session, authenticates as a CalNet user, 
 ## asks for a course to download from, then downloads all
 ## found videos to a target location.
 ##
 ## Copyright (c) 2022 Kevin Mo
 ##
 ## Libraries
 import time
 import re
 import os
 import requests
 # from selenium import webdriver
 from seleniumwire import webdriver
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from getpass import getpass
 import json
 from api import auth
 from tqdm import tqdm, trange
 from tqdm.contrib.concurrent import thread_map
 ## STEP 0: Common variables
 # Root folder under which one sub-folder per course is created.
 # NOTE(review): hard-coded, machine-specific path; adjust before running elsewhere.
 DATA_LOCATION = "D:/Class/Lectures"
 ## STEP 1: Start session and load cookies
 # NOTE(review): despite the step title, no cookies are loaded in the
 # visible code (auth.load_cookies is not called here).
 # `webdriver` is selenium-wire's, so the session's requests can be
 # inspected later on.
 options = Options()
 options.add_argument("--headless")
 driver = webdriver.Firefox(options=options)
 # Element lookups wait up to 10 seconds before giving up.
 driver.implicitly_wait(10)
 ## STEP 2: Direct to bcourses, then check auth.berkeley.edu
 BCOURSES_URL = "https://bcourses.berkeley.edu"
 driver.get("https://bcourses.berkeley.edu")
 # Credentials are only requested when the browser ended up on the CalNet login host.
 if auth.check_calnet_auth(driver):
    # Prompt user for username and (hidden) password
    uid = input("CalNet ID: ")
    pwd = getpass("CalNet password: ")
    print("Performing authentication using credentials...")
    auth.perform_calnet_auth(driver, uid, pwd)
 # Give the browser up to 10 seconds to arrive at a bCourses URL.
 # NOTE(review): WebDriverWait.until presumably raises TimeoutException on
 # timeout instead of returning a falsy value, so the raise below looks
 # unreachable - confirm.
 wait = WebDriverWait(driver, 10)
 if not wait.until(EC.url_contains(BCOURSES_URL)):
    raise Exception("Could not navigate to bCourses url")
 ## STEP 3: Navigate to bCourses API and parse JSON
 # The course list is fetched one page at a time through the Selenium
 # session; the user picks a course by its number or pages with 'b'/'n'.
 BCOURSES_API = "https://bcourses.berkeley.edu/api/v1/courses"
 page_num = 1
 selected_class = None
 while True:
    driver.get(BCOURSES_API + "?page=" + str(page_num))
    # The JSON text is read from the page element with id "json"
    # (presumably Firefox's JSON viewer - verify if the browser changes).
    json_content = driver.find_element(By.ID, "json").text
    all_classes = json.loads(json_content)
    if not isinstance(all_classes, list):
        # Anything but a list (e.g. an error object) cannot be listed;
        # stop with a clear message instead of failing inside the loop.
        print("Received unexpected JSON response from bCourses API.")
        raise Exception("Unexpected JSON response from bCourses API.")
    print()
    print("Select a class to download from:")
    for i, bclass in enumerate(all_classes):
        if "name" not in bclass:
            bclass["name"] = "Unknown class"
        print(str(i+1) + ") " + bclass["name"] + " (class ID #" + str(bclass["id"]) + ")")
    class_option = input("Select an numbered option ('b' for last page, 'n' for next page): ").strip()
    if class_option == 'b':
        # Never go below the first page
        page_num = max(1, page_num-1)
    elif class_option == 'n':
        page_num = page_num + 1
    else:
        try:
            class_idx = int(class_option)
        except ValueError:
            print("Invalid input or numbered option!")
            continue
        # Only the numbers shown on screen are valid; 0 or a negative
        # number must not wrap around to the end of the list.
        if 1 <= class_idx <= len(all_classes):
            selected_class = all_classes[class_idx-1]
            break
        print("Invalid input or numbered option!")
 if selected_class is None:
    raise Exception("No class ID found.")
 print("\nSelected " + selected_class["name"])
 ## STEP 3.5: Initialize directory for video transfer
 # Everything except word characters, whitespace and parentheses is
 # dropped from the course name before it is used as a folder name.
 final_dir = DATA_LOCATION + "/" + re.sub(r'[^\w\s\(\)]', '', selected_class["name"])
 # exist_ok=False: an already existing course folder stops the run
 # (FileExistsError) instead of being written into.
 os.makedirs(final_dir, exist_ok=False)
 ## STEP 4: Navigate to Kaltura and pull all videos
 driver.get(BCOURSES_URL + "/courses/" + str(selected_class["id"]) + "/external_tools/78985")
 # The gallery lives inside the "tool_content" iframe
 kaltura_frame = driver.find_element(By.ID, "tool_content")
 driver.switch_to.frame(kaltura_frame)
 # Fetch more gallery items by clicking on
 # the more button
 print("Fetching all videos from Kaltura library...")
 try:
    while True:
        expand_more = driver.find_element(By.CSS_SELECTOR, ".endless-scroll-more > .btn")
        expand_more.click()
        # Give the next batch of items time to load
        time.sleep(3)
 except Exception:
    # The loop ends when the button can no longer be found or clicked.
    # Catching Exception (not a bare except) lets Ctrl+C and SystemExit
    # abort the run instead of being mistaken for "no more items".
    print("Expanded all videos from the listing as possible.")
 gallery_elems = driver.find_elements(By.CLASS_NAME, "galleryItem")
 num_elems = len(gallery_elems)
 class GalleryItem:
    """
    One video entry scraped from the Kaltura gallery listing.

    Title, author, date, thumbnail and video page URL are read from the
    listing element; the download/subtitle URL maps, the download path
    and the processed/downloaded flags start out empty and are filled
    in by later steps of the script.
    """
    def __init__(self, elem, index=-1):
        """
        Read the listing fields from elem (a gallery WebElement).
        index is the item's number; a negative value means unknown.
        """
        def text_of(class_name):
            # Visible text of the first child carrying class_name
            return elem.find_element(By.CLASS_NAME, class_name).text
        def attribute_of(class_name, attribute):
            # Attribute value of the first child carrying class_name
            return elem.find_element(By.CLASS_NAME, class_name).get_attribute(attribute)
        self.index = index
        self.title = text_of("thumb_name_content")
        self.author = text_of("userLink")
        self.date_added = elem.find_element(By.CSS_SELECTOR, ".thumbTimeAdded > span > span").text
        self.thumbnail = attribute_of("thumb_img", "src")
        self.video_url = attribute_of("item_link", "href")
        # Filled in while the video page is processed
        self.download_urls = {}
        self.srt_urls = {}
        self.download_path = None
        self.processed = False
        self.downloaded = False
    def __str__(self):
        """Return "(#<index>) <title> - <author>"."""
        pieces = ("(#", self.str_index(), ") ", self.title, " - ", self.author)
        return "".join(pieces)
    def get_folder_name(self):
        """Return str(self) stripped of everything but word characters, whitespace and parentheses."""
        disallowed = r'[^\w\s\(\)]'
        return re.sub(disallowed, '', str(self))
    def str_index(self):
        """Return the index zero-padded to three digits, or "UNK" when it is negative."""
        return "UNK" if self.index < 0 else format(self.index, "03")
 # Items are numbered from num_elems down to 1, in listing order
 listing_numbers = range(num_elems, 0, -1)
 gallery_items = [GalleryItem(elem, index=number) for elem, number in zip(gallery_elems, listing_numbers)]
 print("This tool will download", len(gallery_items), "videos from the Kaltura gallery.\n")
 # Step back out of the gallery iframe
 driver.switch_to.parent_frame()
 # Regex patterns, matched against the path of intercepted requests.
 # re_vid: HLS video segment; group 4 is the entryId.
 # re_str: caption asset; group 1 is the captionAssetId.
 # The dots of ".ts", "index.php" and ".srt" are escaped so they only
 # match a literal dot instead of any character.
 re_vid = re.compile(r"\/(scf\/hls)\/p\/(\d+)\/sp\/(\d+)\/serveFlavor\/entryId\/(\w+)\/v\/\d+\/ev\/\d+\/flavorId\/(\w+)\/name\/([\w\.]+)\/seg-(\d+)-[\w\-]+\.ts")
 re_str = re.compile(r"\/api_v3\/index\.php\/service\/caption_captionAsset\/action\/serve\/captionAssetId\/(\w+)\/ks\/([\w\-]+)\/\.srt")
 print("Now processing detailed metadata and download links for all videos.")
 print("This process will take a while to complete.")
 def process_gallery_item(gallery_item):
    """
    Gather full information for each video and
    find video + srt links from browser requests.

    Opens the video page, refreshes author and date from it, starts
    playback and opens the quality menu, then waits three seconds while
    the request interceptor records matching requests:
      - video segments go to gallery_item.download_urls, keyed by
        "<entryId>.mp4", with "/scf/hls/" replaced by "/pd/" in the URL;
      - captions go to gallery_item.srt_urls, keyed by
        "<captionAssetId>.srt".
    Sets gallery_item.processed to True on success. The interceptor is
    removed again even when a step fails; the error still propagates.
    """
    # Read in requests
    def read_requests(request):
        # Only requests to the Kaltura video host are of interest
        if request.host != "cfvod.kaltura.com":
            return
        vid_match = re_vid.match(request.path)
        srt_match = re_str.match(request.path)
        if vid_match:
            gallery_item.download_urls[vid_match.group(4) + '.mp4'] = request.url.replace("/scf/hls/", "/pd/")
        elif srt_match:
            gallery_item.srt_urls[srt_match.group(1) + '.srt'] = request.url
    # Reset all requests to proxy
    del driver.requests
    driver.request_interceptor = read_requests
    try:
        driver.get(gallery_item.video_url)
        gallery_item.author = driver.find_element(By.CLASS_NAME, "userLink").text
        gallery_item.date_added = driver.find_element(By.CSS_SELECTOR, "#js-entry-create-at > span").text
        # The player lives in its own iframe
        play_frame = driver.find_element(By.ID, "kplayer_ifp")
        driver.switch_to.frame(play_frame)
        play_button = driver.find_element(By.CLASS_NAME, "largePlayBtn")
        play_button.click()
        # Open quality options
        settings_button = driver.find_element(By.CSS_SELECTOR, ".sourceSelector > button")
        settings_button.click()
        # Let the player issue its segment and caption requests
        time.sleep(3)
        gallery_item.processed = True
    finally:
        # Always detach the interceptor so a failure on this item does
        # not leave a stale callback installed on the shared driver.
        del driver.request_interceptor
 def print_gallery_item(gallery_item):
    """
    Print a gallery item for debugging, one value per line: its
    string form, date added, thumbnail URL, video page URL, download
    URL map and subtitle URL map.
    """
    print(str(gallery_item))
    detail_fields = ("date_added", "thumbnail", "video_url", "download_urls", "srt_urls")
    for field_name in detail_fields:
        print(getattr(gallery_item, field_name))
 # Visit every video page to collect its metadata and download links
 with tqdm(gallery_items, position=0) as pbar:
    for gallery_item in pbar:
        pbar.set_description("Processing '" + gallery_item.title + "'")
        process_gallery_item(gallery_item)
 print()
 print("All links have finished pre-processing.")
 print()
 print("Creating the required folders for each lecture and")
 print("saving pre-processed metadata...")
 with tqdm(gallery_items, position=0) as pbar:
    for gallery_item in pbar:
        pbar.set_description("Allocating '" + gallery_item.title + "'")
        # Create the subfolder
        dl_path = final_dir + "/" + gallery_item.get_folder_name()
        os.makedirs(dl_path)
        gallery_item.download_path = dl_path
        # Keep all collected attributes of the item next to its files
        with open(dl_path + "/download.json", 'w') as f:
            f.write(json.dumps(gallery_item.__dict__, indent=4))
 # End the browser session. quit() (instead of close(), which only
 # closes the current window) also shuts the driver down.
 driver.quit()
 ## STEP 5: Download all the lecture data in parallel
 # Number of worker threads used for the downloads
 NUM_PARALLEL = 5
 print()
 print(f"Downloading lecture data in parallel (# of streams: {NUM_PARALLEL}).")
 for notice_line in (
    "This process will take very long depending on your internet speed,",
    "but should take less time to finish towards the end.",
 ):
    print(notice_line)
 def download_lecture(gallery_item):
    """
    Download all video and subtitle files of one lecture.

    Each entry of gallery_item.download_urls and gallery_item.srt_urls
    (file name -> URL) is saved into gallery_item.download_path; files
    that already exist there are skipped. Data is first written to
    "<name>.part" and renamed once complete, so an interrupted transfer
    is not mistaken for a finished file on a later run.

    Sets gallery_item.downloaded to True only when no download failed,
    and returns the same gallery_item.
    """
    dl_path = gallery_item.download_path
    # Videos and subtitles share one file name -> URL mapping
    dl_list = {**gallery_item.download_urls, **gallery_item.srt_urls}
    all_ok = True
    for fname, url in dl_list.items():
        target = dl_path + "/" + fname
        if os.path.exists(target):
            continue
        partial = target + ".part"
        try:
            # (connect, read) timeouts keep a stalled server from hanging
            # a worker forever; the with-block releases the connection.
            with requests.get(url, stream=True, timeout=(10, 60)) as r:
                if r.status_code != 200:
                    print("Encountered unexpected status code", r.status_code, "while downloading", fname)
                    all_ok = False
                    continue
                with open(partial, "wb") as f:
                    # 1 MiB chunks instead of the 128-byte default of
                    # iterating the response directly
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(partial, target)
        except requests.RequestException as err:
            # One failed file must not abort the other downloads
            print("Download failed for", fname, "-", err)
            all_ok = False
            if os.path.exists(partial):
                os.remove(partial)
    gallery_item.downloaded = all_ok
    return gallery_item
 # Run the downloads on NUM_PARALLEL worker threads
 next_items = thread_map(download_lecture, gallery_items, max_workers=NUM_PARALLEL)
 print()
 print("Status Report:")
 downloaded_total = sum(item.downloaded for item in next_items)
 print("Total videos downloaded", downloaded_total, "out of", len(next_items))
 print("Tool has finished execution.")