improve UX

3 years ago · 2997c7c2d5
1 changed files with 43 additions and 10 deletions
--- a/dl-kaltura.py
+++ b/dl-kaltura.py
@ -11,9 +11,11 @@
 ##

 ## Libraries
+import shutil
 import time
 import re
 import os
+import sys
 import requests
 # from selenium import webdriver
 from seleniumwire import webdriver
@ -29,12 +31,22 @@ from tqdm.contrib.concurrent import thread_map

 ## STEP 0: Common variables
 DATA_LOCATION = "D:/Class/Lectures"
+IMPLICIT_WAIT_PERIOD = 10       # how long the driver should wait to find elements before quitting (general)
+IMPLICIT_LONG_PERIOD = 30       # how long the driver should wait to find elements before quitting (ajax/spa elements)
+LECTURE_FETCH_PERIOD = 10       # how long the driver should stream lectures to fetch all download links/resources
+FETCH_ALL_STALL_TIME = 3        # how long the driver should stall before expanding more through the expand button
+NUM_PARALLEL_THREADS = 5        # how many parallel threads to download lecture resources
+DRIVER_HEADLESS_MODE = True     # whether to run the driver in headless mode (browser does not show up)
+DRIVER_AUDIO_DISABLE = True     # whether to disable browser audio on the driver

 ## STEP 1: Start session and load cookies
 options = Options()
-options.add_argument("--headless")
+if DRIVER_HEADLESS_MODE:
+    options.add_argument("--headless")
+if DRIVER_AUDIO_DISABLE:
+    options.set_preference("media.volume_scale", "0.0")
 driver = webdriver.Firefox(options=options)
-driver.implicitly_wait(10)
+driver.implicitly_wait(IMPLICIT_WAIT_PERIOD)

 ## STEP 2: Direct to bcourses, then check auth.berkeley.edu
 BCOURSES_URL = "https://bcourses.berkeley.edu"
@ -46,7 +58,7 @@ if auth.check_calnet_auth(driver):
    print("Performing authentication using credentials...")
    auth.perform_calnet_auth(driver, uid, pwd)

-wait = WebDriverWait(driver, 10)
+wait = WebDriverWait(driver, IMPLICIT_WAIT_PERIOD)
 if not wait.until(EC.url_contains(BCOURSES_URL)):
    raise Exception("Could not navigate to bCourses url")

@ -92,6 +104,14 @@ print("\nSelected " + selected_class["name"])

 ## STEP 3.5: Initialize directory for video transfer
 final_dir = DATA_LOCATION + "/" + re.sub(r'[^\w\s\(\)]', '', selected_class["name"])
+if os.path.isdir(final_dir):
+    print("Folder already detected for this class in data directory.")
+    confirm_del = input("Delete this directory to proceed? (y/N): ")
+    if confirm_del.lower() != "y":
+        sys.exit(1)
+    print("Deleting class directory...")
+    shutil.rmtree(final_dir)
+
 os.makedirs(final_dir, exist_ok=False)

 ## STEP 4: Navigate to Kaltura and pull all videos
@ -106,7 +126,7 @@ try:
    while True:
        expand_more = driver.find_element(By.CSS_SELECTOR, ".endless-scroll-more > .btn")
        expand_more.click()
-        time.sleep(3)
+        time.sleep(FETCH_ALL_STALL_TIME)
 except:
    print("Expanded all videos from the listing as possible.")

@ -145,7 +165,7 @@ driver.switch_to.parent_frame()

 # Regex patterns
 re_vid = re.compile(r"\/(scf\/hls)\/p\/(\d+)\/sp\/(\d+)\/serveFlavor\/entryId\/(\w+)\/v\/\d+\/ev\/\d+\/flavorId\/(\w+)\/name\/([\w\.]+)\/seg-(\d+)-[\w\-]+.ts")
-re_str = re.compile(r"\/api_v3\/index.php\/service\/caption_captionAsset\/action\/serve\/captionAssetId\/(\w+)\/ks\/([\w\-]+)\/.srt")
+re_str = re.compile(r"\/api_v3\/index.php\/service\/caption_captionAsset\/action\/serve\/captionAssetId\/(\w+)\/ks\/([\w\-\_]+)\/.srt")

 print("Now processing detailed metadata and download links for all videos.")
 print("This process will take a while to complete.")
@ -171,9 +191,16 @@ def process_gallery_item(gallery_item):
    driver.request_interceptor = read_requests
    driver.get(gallery_item.video_url)

+    wait = WebDriverWait(driver, IMPLICIT_LONG_PERIOD)
+    wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "userLink")))
    gallery_item.author = driver.find_element(By.CLASS_NAME, "userLink").text
+
+    wait = WebDriverWait(driver, IMPLICIT_LONG_PERIOD)
+    wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "#js-entry-create-at > span")))
    gallery_item.date_added = driver.find_element(By.CSS_SELECTOR, "#js-entry-create-at > span").text

+    wait = WebDriverWait(driver, IMPLICIT_LONG_PERIOD)
+    wait.until(EC.element_to_be_clickable((By.ID, "kplayer_ifp")))
    play_frame = driver.find_element(By.ID, "kplayer_ifp")
    driver.switch_to.frame(play_frame)

@ -190,7 +217,7 @@ def process_gallery_item(gallery_item):
        time.sleep(0.1)
        pbar.set_description("Reading network")
    """
-    time.sleep(3)
+    time.sleep(LECTURE_FETCH_PERIOD)

    gallery_item.processed = True
    del driver.request_interceptor
@ -229,10 +256,8 @@ with tqdm(gallery_items, position=0) as pbar:
 driver.close()

 ## STEP 5: Download all the lecture data in parallel
-NUM_PARALLEL = 5
-
 print()
-print("Downloading lecture data in parallel (# of streams: " + str(NUM_PARALLEL) + ").")
+print("Downloading lecture data in parallel (# of streams: " + str(NUM_PARALLEL_THREADS) + ").")
 print("This process will take very long depending on your internet speed,")
 print("but should take less time to finish towards the end.")

@ -244,16 +269,24 @@ def download_lecture(gallery_item):
        if not os.path.exists(dl_path + "/" + fname):
            r = requests.get(dl_list[fname], stream=True)
            if r.status_code == 200:
+                total_length = int(r.headers.get("Content-Length"))
+                with tqdm.wrapattr(r.raw, "read", total=total_length, desc=gallery_item.title + " (" + fname + ")") as raw:
+                    # save the output to a file
+                    with open(dl_path + "/" + fname, 'wb') as output:
+                        shutil.copyfileobj(raw, output)
+                
+                """
                with open(dl_path + "/" + fname, "wb") as f:
                    for chunk in r:
                        f.write(chunk)
+                """
            else:
                print("Encountered unexpected status code", r.status_code, "while downloading", fname)
    
    gallery_item.downloaded = True
    return gallery_item

-next_items = thread_map(download_lecture, gallery_items, max_workers=NUM_PARALLEL)
+next_items = thread_map(download_lecture, gallery_items, max_workers=NUM_PARALLEL_THREADS, position=0)

 print()
 print("Status Report:")