Browse Source

improve UX

master
kmo-ev 2 years ago
parent
commit
2997c7c2d5
  1. 53
      dl-kaltura.py

53
dl-kaltura.py

@ -11,9 +11,11 @@
## ##
## Libraries ## Libraries
import shutil
import time import time
import re import re
import os import os
import sys
import requests import requests
# from selenium import webdriver # from selenium import webdriver
from seleniumwire import webdriver from seleniumwire import webdriver
@ -29,12 +31,22 @@ from tqdm.contrib.concurrent import thread_map
## STEP 0: Common variables ## STEP 0: Common variables
DATA_LOCATION = "D:/Class/Lectures" DATA_LOCATION = "D:/Class/Lectures"
IMPLICIT_WAIT_PERIOD = 10 # how long the driver should wait to find elements before quitting (general)
IMPLICIT_LONG_PERIOD = 30 # how long the driver should wait to find elements before quitting (ajax/spa elements)
LECTURE_FETCH_PERIOD = 10 # how long the driver should stream lectures to fetch all download links/resources
FETCH_ALL_STALL_TIME = 3 # how long the driver should stall before expanding more through the expand button
NUM_PARALLEL_THREADS = 5 # how many parallel threads to download lecture resources
DRIVER_HEADLESS_MODE = True # whether to run the driver in headless mode (browser does not show up)
DRIVER_AUDIO_DISABLE = True # whether to disable browser audio on the driver
## STEP 1: Start session and load cookies ## STEP 1: Start session and load cookies
options = Options() options = Options()
options.add_argument("--headless") if DRIVER_HEADLESS_MODE:
options.add_argument("--headless")
if DRIVER_AUDIO_DISABLE:
options.set_preference("media.volume_scale", "0.0")
driver = webdriver.Firefox(options=options) driver = webdriver.Firefox(options=options)
driver.implicitly_wait(10) driver.implicitly_wait(IMPLICIT_WAIT_PERIOD)
## STEP 2: Direct to bcourses, then check auth.berkeley.edu ## STEP 2: Direct to bcourses, then check auth.berkeley.edu
BCOURSES_URL = "https://bcourses.berkeley.edu" BCOURSES_URL = "https://bcourses.berkeley.edu"
@ -46,7 +58,7 @@ if auth.check_calnet_auth(driver):
print("Performing authentication using credentials...") print("Performing authentication using credentials...")
auth.perform_calnet_auth(driver, uid, pwd) auth.perform_calnet_auth(driver, uid, pwd)
wait = WebDriverWait(driver, 10) wait = WebDriverWait(driver, IMPLICIT_WAIT_PERIOD)
if not wait.until(EC.url_contains(BCOURSES_URL)): if not wait.until(EC.url_contains(BCOURSES_URL)):
raise Exception("Could not navigate to bCourses url") raise Exception("Could not navigate to bCourses url")
@ -92,6 +104,14 @@ print("\nSelected " + selected_class["name"])
## STEP 3.5: Initialize directory for video transfer ## STEP 3.5: Initialize directory for video transfer
final_dir = DATA_LOCATION + "/" + re.sub(r'[^\w\s\(\)]', '', selected_class["name"]) final_dir = DATA_LOCATION + "/" + re.sub(r'[^\w\s\(\)]', '', selected_class["name"])
if os.path.isdir(final_dir):
print("Folder already detected for this class in data directory.")
confirm_del = input("Delete this directory to proceed? (y/N): ")
if confirm_del.lower() != "y":
sys.exit(1)
print("Deleting class directory...")
shutil.rmtree(final_dir)
os.makedirs(final_dir, exist_ok=False) os.makedirs(final_dir, exist_ok=False)
## STEP 4: Navigate to Kaltura and pull all videos ## STEP 4: Navigate to Kaltura and pull all videos
@ -106,7 +126,7 @@ try:
while True: while True:
expand_more = driver.find_element(By.CSS_SELECTOR, ".endless-scroll-more > .btn") expand_more = driver.find_element(By.CSS_SELECTOR, ".endless-scroll-more > .btn")
expand_more.click() expand_more.click()
time.sleep(3) time.sleep(FETCH_ALL_STALL_TIME)
except: except:
print("Expanded all videos from the listing as possible.") print("Expanded all videos from the listing as possible.")
@ -145,7 +165,7 @@ driver.switch_to.parent_frame()
# Regex patterns # Regex patterns
re_vid = re.compile(r"\/(scf\/hls)\/p\/(\d+)\/sp\/(\d+)\/serveFlavor\/entryId\/(\w+)\/v\/\d+\/ev\/\d+\/flavorId\/(\w+)\/name\/([\w\.]+)\/seg-(\d+)-[\w\-]+.ts") re_vid = re.compile(r"\/(scf\/hls)\/p\/(\d+)\/sp\/(\d+)\/serveFlavor\/entryId\/(\w+)\/v\/\d+\/ev\/\d+\/flavorId\/(\w+)\/name\/([\w\.]+)\/seg-(\d+)-[\w\-]+.ts")
re_str = re.compile(r"\/api_v3\/index.php\/service\/caption_captionAsset\/action\/serve\/captionAssetId\/(\w+)\/ks\/([\w\-]+)\/.srt") re_str = re.compile(r"\/api_v3\/index.php\/service\/caption_captionAsset\/action\/serve\/captionAssetId\/(\w+)\/ks\/([\w\-\_]+)\/.srt")
print("Now processing detailed metadata and download links for all videos.") print("Now processing detailed metadata and download links for all videos.")
print("This process will take a while to complete.") print("This process will take a while to complete.")
@ -171,9 +191,16 @@ def process_gallery_item(gallery_item):
driver.request_interceptor = read_requests driver.request_interceptor = read_requests
driver.get(gallery_item.video_url) driver.get(gallery_item.video_url)
wait = WebDriverWait(driver, IMPLICIT_LONG_PERIOD)
wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "userLink")))
gallery_item.author = driver.find_element(By.CLASS_NAME, "userLink").text gallery_item.author = driver.find_element(By.CLASS_NAME, "userLink").text
wait = WebDriverWait(driver, IMPLICIT_LONG_PERIOD)
wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "#js-entry-create-at > span")))
gallery_item.date_added = driver.find_element(By.CSS_SELECTOR, "#js-entry-create-at > span").text gallery_item.date_added = driver.find_element(By.CSS_SELECTOR, "#js-entry-create-at > span").text
wait = WebDriverWait(driver, IMPLICIT_LONG_PERIOD)
wait.until(EC.element_to_be_clickable((By.ID, "kplayer_ifp")))
play_frame = driver.find_element(By.ID, "kplayer_ifp") play_frame = driver.find_element(By.ID, "kplayer_ifp")
driver.switch_to.frame(play_frame) driver.switch_to.frame(play_frame)
@ -190,7 +217,7 @@ def process_gallery_item(gallery_item):
time.sleep(0.1) time.sleep(0.1)
pbar.set_description("Reading network") pbar.set_description("Reading network")
""" """
time.sleep(3) time.sleep(LECTURE_FETCH_PERIOD)
gallery_item.processed = True gallery_item.processed = True
del driver.request_interceptor del driver.request_interceptor
@ -229,10 +256,8 @@ with tqdm(gallery_items, position=0) as pbar:
driver.close() driver.close()
## STEP 5: Download all the lecture data in parallel ## STEP 5: Download all the lecture data in parallel
NUM_PARALLEL = 5
print() print()
print("Downloading lecture data in parallel (# of streams: " + str(NUM_PARALLEL) + ").") print("Downloading lecture data in parallel (# of streams: " + str(NUM_PARALLEL_THREADS) + ").")
print("This process will take very long depending on your internet speed,") print("This process will take very long depending on your internet speed,")
print("but should take less time to finish towards the end.") print("but should take less time to finish towards the end.")
@ -244,16 +269,24 @@ def download_lecture(gallery_item):
if not os.path.exists(dl_path + "/" + fname): if not os.path.exists(dl_path + "/" + fname):
r = requests.get(dl_list[fname], stream=True) r = requests.get(dl_list[fname], stream=True)
if r.status_code == 200: if r.status_code == 200:
total_length = int(r.headers.get("Content-Length"))
with tqdm.wrapattr(r.raw, "read", total=total_length, desc=gallery_item.title + " (" + fname + ")") as raw:
# save the output to a file
with open(dl_path + "/" + fname, 'wb') as output:
shutil.copyfileobj(raw, output)
"""
with open(dl_path + "/" + fname, "wb") as f: with open(dl_path + "/" + fname, "wb") as f:
for chunk in r: for chunk in r:
f.write(chunk) f.write(chunk)
"""
else: else:
print("Encountered unexpected status code", r.status_code, "while downloading", fname) print("Encountered unexpected status code", r.status_code, "while downloading", fname)
gallery_item.downloaded = True gallery_item.downloaded = True
return gallery_item return gallery_item
next_items = thread_map(download_lecture, gallery_items, max_workers=NUM_PARALLEL) next_items = thread_map(download_lecture, gallery_items, max_workers=NUM_PARALLEL_THREADS, position=0)
print() print()
print("Status Report:") print("Status Report:")

Loading…
Cancel
Save