Browse Source

dl-kaltura up

master
kmo-ev 2 years ago
commit
8656879ec2
  1. 164
      .gitignore
  2. 0
      api/__init__.py
  3. 91
      api/auth.py
  4. 261
      dl-kaltura.py

164
.gitignore vendored

@ -0,0 +1,164 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Driver
geckodriver.exe
cookies.pkl
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

0
api/__init__.py

91
api/auth.py

@ -0,0 +1,91 @@
##
## api/auth.py
## Provides functions for CalNet authentication over
## Selenium, including cookies export.
##
## Copyright (c) 2022 Kevin Mo
##
import pickle
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# File (relative to the current working directory) that holds the pickled
# cookie list shared by save_cookies and load_cookies.
COOKIES_NAME = "cookies.pkl"


def save_cookies(driver):
    """
    Save cookies given a Selenium driver.

    The list returned by ``driver.get_cookies()`` is pickled into the
    file named by COOKIES_NAME, replacing any previous contents.

    Returns True once the cookies have been written.
    """
    # Fetch the cookies first so a failing driver call does not truncate
    # an existing cookie file.
    cookies = driver.get_cookies()
    # The context manager guarantees the handle is flushed and closed,
    # which the previous bare open() call did not.
    with open(COOKIES_NAME, "wb") as cookie_file:
        pickle.dump(cookies, cookie_file)
    return True
def load_cookies(driver):
    """
    Load cookies given a Selenium driver.

    Reads the pickled cookie list from the file named by COOKIES_NAME and
    passes every entry to ``driver.add_cookie``. An IOError (for example
    a cookie file that does not exist yet) is ignored on purpose, so the
    call is safe on a first run.

    Returns the driver object that was passed in.
    """
    try:
        # SECURITY: unpickling can execute arbitrary code. Only load a
        # cookie file that was written by save_cookies on this machine.
        with open(COOKIES_NAME, "rb") as cookie_file:
            cookies = pickle.load(cookie_file)
        for cookie in cookies:
            driver.add_cookie(cookie)
    except IOError:
        # Best effort: without a readable cookie file the session simply
        # starts unauthenticated.
        pass
    return driver
def check_calnet_auth(driver):
    """
    Tell whether the driver is currently sitting on the CalNet login host.

    Returns True when the driver's current URL contains
    "auth.berkeley.edu", and False otherwise.
    """
    current_location = driver.current_url
    return "auth.berkeley.edu" in current_location
def perform_calnet_auth(driver, cid, pwd):
    """
    Perform authentication with CalNet, given
    that driver is at auth.berkeley.edu.

    Fills in the "username" and "password" fields with ``cid`` and
    ``pwd``, clicks the "submit" element and, when the browser lands on
    a duosecurity.com page, waits for the user to complete 2FA and
    clicks the "trust-browser-button".

    Returns True when authentication completed (with or without 2FA) and
    False when the Duo step timed out or failed inside the browser.

    Raises Exception when the driver is not on the CalNet login page or
    when the login page is still shown after submitting the credentials.
    """
    # Function-scope import: keeps this fix self-contained without
    # touching the module's import block.
    from selenium.common.exceptions import WebDriverException

    print("Performing Calnet authentication")
    print("Current URL:", driver.current_url)
    if not check_calnet_auth(driver):
        raise Exception("Cannot perform authentication at current state.")

    # Target user input and then submit.
    # find_element(By.ID, ...) replaces the find_element_by_id helpers,
    # which no longer exist in Selenium 4.
    cid_box = driver.find_element(By.ID, "username")
    pwd_box = driver.find_element(By.ID, "password")
    submit_box = driver.find_element(By.ID, "submit")
    cid_box.send_keys(cid)
    pwd_box.send_keys(pwd)
    submit_box.click()

    # Handle incorrect login attempt
    if "auth.berkeley.edu" in driver.current_url:
        print("Incorrect login attempt detected.")
        raise Exception("Incorrect CalNet credentials")

    # Check the presence of Duo 2FA
    try:
        if "duosecurity.com" not in driver.current_url:
            return True
        print()
        print("IMPORTANT: Complete the 2FA process on the automated browser.")
        print("Complete the process through push notification, security key, etc.")
        print()

        # Stall for success or trust box
        wait = WebDriverWait(driver, 30)
        trust_box = wait.until(EC.element_to_be_clickable((By.ID, "trust-browser-button")))
        print("Detected trust browser button, clicking...")
        trust_box.click()

        # Success means the browser has LEFT the Duo page. The previous
        # check waited for the URL to contain duosecurity.com, which is
        # already true at this point and therefore never detected the
        # redirect. until_not raises TimeoutException if we stay on Duo.
        wait = WebDriverWait(driver, 10)
        wait.until_not(EC.url_contains("duosecurity.com"))
    except WebDriverException:
        # Covers TimeoutException and other browser-side failures while
        # letting KeyboardInterrupt and programming errors propagate.
        print("Timed out or could not locate Duo 2FA prompt.")
        return False
    print("Detected redirect, authentication is a success!")
    return True

261
dl-kaltura.py

@ -0,0 +1,261 @@
##
## dl-kaltura.py
## Downloads videos from Kaltura (external tool #78985) with
## Selenium WebDriver informed by the bCourses API.
##
## Starts a Selenium session, authenticates as a CalNet user,
## asks for a course to download from, then downloads all
## found videos to a target location.
##
## Copyright (c) 2022 Kevin Mo
##
## Libraries
import json
import os
import re
import time
from getpass import getpass

import requests
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# from selenium import webdriver
from seleniumwire import webdriver
from tqdm import tqdm, trange
from tqdm.contrib.concurrent import thread_map

from api import auth
## STEP 0: Common variables
# Root folder that receives one sub-folder per course. The default is the
# original hard-coded location; set the DL_KALTURA_DATA_LOCATION
# environment variable to download somewhere else without editing the
# script.
DATA_LOCATION = os.environ.get("DL_KALTURA_DATA_LOCATION", "D:/Class/Lectures")

## STEP 1: Start a headless Firefox session
# NOTE: no saved cookies are loaded here; authentication happens in STEP 2.
options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)
# Implicit wait of 10 seconds for the element lookups made on this driver.
driver.implicitly_wait(10)
## STEP 2: Direct to bcourses, then check auth.berkeley.edu
BCOURSES_URL = "https://bcourses.berkeley.edu"
# Use the constant instead of repeating the literal URL.
driver.get(BCOURSES_URL)
if auth.check_calnet_auth(driver):
    # Prompt user for username
    uid = input("CalNet ID: ")
    pwd = getpass("CalNet password: ")
    print("Performing authentication using credentials...")
    auth.perform_calnet_auth(driver, uid, pwd)

# WebDriverWait.until raises TimeoutException instead of returning a
# falsy value, so the friendly error below was unreachable when written
# as `if not wait.until(...)`. Translate the timeout explicitly.
wait = WebDriverWait(driver, 10)
try:
    wait.until(EC.url_contains(BCOURSES_URL))
except TimeoutException as exc:
    raise Exception("Could not navigate to bCourses url") from exc
## STEP 3: Navigate to bCourses API and parse JSON
BCOURSES_API = "https://bcourses.berkeley.edu/api/v1/courses"
page_num = 1
selected_class = None
while True:
    driver.get(BCOURSES_API + "?page=" + str(page_num))
    # The raw response text is read from the element with id "json"
    # (presumably Firefox's built-in JSON viewer -- verify when changing
    # the browser).
    json_content = driver.find_element(By.ID, "json").text
    all_classes = json.loads(json_content)
    if not isinstance(all_classes, list):
        print("Received unexpected JSON response from bCourses API.")
        # Stop here instead of trying to list a non-list payload; the
        # check after the loop reports that no class was selected.
        break
    print()
    print("Select a class to download from:")
    for i, bclass in enumerate(all_classes):
        if "name" not in bclass:
            bclass["name"] = "Unknown class"
        print(str(i+1) + ") " + bclass["name"] + " (class ID #" + str(bclass["id"]) + ")")
    class_option = input("Select an numbered option ('b' for last page, 'n' for next page): ")
    if class_option == 'b':
        page_num = max(1, page_num-1)
    elif class_option == 'n':
        page_num = page_num + 1
    else:
        try:
            class_idx = int(class_option)
        except ValueError:
            print("Invalid input or numbered option!")
            continue
        # Reject 0 and negative numbers explicitly: Python's negative
        # indexing would otherwise silently pick a class from the end
        # of the list.
        if 1 <= class_idx <= len(all_classes):
            selected_class = all_classes[class_idx-1]
            break
        print("Invalid input or numbered option!")
if selected_class is None:
    raise Exception("No class ID found.")
print("\nSelected " + selected_class["name"])
## STEP 3.5: Initialize directory for video transfer
# Strip everything except word characters, whitespace and parentheses
# from the class name before using it as a folder name.
safe_class_name = re.sub(r'[^\w\s\(\)]', '', selected_class["name"])
final_dir = "/".join((DATA_LOCATION, safe_class_name))
# exist_ok defaults to False: an already existing course folder raises
# FileExistsError rather than being reused.
os.makedirs(final_dir)
## STEP 4: Navigate to Kaltura and pull all videos
# 78985 is the bCourses external-tool id of Kaltura (see file header).
driver.get(BCOURSES_URL + "/courses/" + str(selected_class["id"]) + "/external_tools/78985")
kaltura_frame = driver.find_element(By.ID, "tool_content")
driver.switch_to.frame(kaltura_frame)

# Fetch more gallery items by clicking on
# the more button
print("Fetching all videos from Kaltura library...")
try:
    while True:
        expand_more = driver.find_element(By.CSS_SELECTOR, ".endless-scroll-more > .btn")
        expand_more.click()
        # Give the gallery time to load the next batch of items.
        time.sleep(3)
except WebDriverException:
    # The loop ends when the button can no longer be found or clicked.
    # Catching only Selenium errors (instead of a bare except) keeps
    # Ctrl+C and programming errors visible.
    print("Expanded all videos from the listing as possible.")

gallery_elems = driver.find_elements(By.CLASS_NAME, "galleryItem")
num_elems = len(gallery_elems)
class GalleryItem:
    """
    Metadata and download state for one entry of the Kaltura gallery.

    The instance ``__dict__`` is written to download.json elsewhere in
    this script, so attributes are plain JSON-friendly values and are
    assigned in a fixed order.
    """

    def __init__(self, elem, index=-1):
        """
        Read the item's fields from its gallery element.

        elem  -- element exposing find_element(by, value); the title,
                 author, date, thumbnail and link are looked up inside it
        index -- lecture number used in names; negative means unknown
        """
        def lookup(by, value):
            return elem.find_element(by, value)

        self.index = index
        # Fields scraped from the gallery listing
        self.title = lookup(By.CLASS_NAME, "thumb_name_content").text
        self.author = lookup(By.CLASS_NAME, "userLink").text
        self.date_added = lookup(By.CSS_SELECTOR, ".thumbTimeAdded > span > span").text
        self.thumbnail = lookup(By.CLASS_NAME, "thumb_img").get_attribute("src")
        self.video_url = lookup(By.CLASS_NAME, "item_link").get_attribute("href")
        # Filled in later: file name -> URL maps and progress flags
        self.download_urls = {}
        self.srt_urls = {}
        self.download_path = None
        self.processed = False
        self.downloaded = False

    def __str__(self):
        """Return "(#<index>) <title> - <author>"."""
        return f"(#{self.str_index()}) {self.title} - {self.author}"

    def get_folder_name(self):
        """Return str(self) reduced to word characters, whitespace and parentheses."""
        label = str(self)
        return re.sub(r'[^\w\s\(\)]', '', label)

    def str_index(self):
        """Return the index zero-padded to three digits, or "UNK" when negative."""
        if self.index >= 0:
            return '{:03}'.format(self.index)
        return "UNK"
# Number the items counting down from num_elems, so the first listed
# element gets the highest index and the last listed element gets 1.
gallery_items = list(map(GalleryItem, gallery_elems, range(num_elems, 0, -1)))
print("This tool will download", len(gallery_items), "videos from the Kaltura gallery.\n")
# Leave the Kaltura iframe that STEP 4 switched into.
driver.switch_to.parent_frame()
# Regex patterns
# Both are matched against the request path with .match(), i.e. anchored
# at the start of the path. Literal dots are escaped so that ".ts",
# ".srt" and "index.php" no longer match an arbitrary character.
#
# re_vid groups: 1 delivery prefix, 2 "p" id, 3 "sp" id, 4 entry id,
#                5 flavor id, 6 file name, 7 segment number
re_vid = re.compile(
    r"/(scf/hls)/p/(\d+)/sp/(\d+)/serveFlavor/entryId/(\w+)"
    r"/v/\d+/ev/\d+/flavorId/(\w+)/name/([\w.]+)/seg-(\d+)-[\w\-]+\.ts"
)
# re_str groups: 1 caption asset id, 2 value of the "ks" path segment
re_str = re.compile(
    r"/api_v3/index\.php/service/caption_captionAsset/action/serve"
    r"/captionAssetId/(\w+)/ks/([\w\-]+)/\.srt"
)
print("Now processing detailed metadata and download links for all videos.")
print("This process will take a while to complete.")
def process_gallery_item(gallery_item):
    """
    Gather full information for each video and
    find video + srt links from browser requests.

    Opens the item's video page, refreshes its author and date from that
    page, starts playback and opens the quality selector. While this
    happens, every request to cfvod.kaltura.com is inspected and matching
    video / caption URLs are stored in ``gallery_item.download_urls`` and
    ``gallery_item.srt_urls``. ``gallery_item.processed`` is set to True
    when all steps completed.

    The request interceptor is removed again even when a step raises, so
    a failure here cannot leave a stale interceptor on the shared driver.
    """
    # Read in requests
    def read_requests(request):
        if request.host != "cfvod.kaltura.com":
            return
        vid_match = re_vid.match(request.path)
        srt_match = re_str.match(request.path)
        if vid_match:
            # Keyed by entry id; the HLS segment URL is rewritten from
            # /scf/hls/ to /pd/ to obtain the URL that is downloaded later.
            gallery_item.download_urls[vid_match.group(4) + '.mp4'] = request.url.replace("/scf/hls/", "/pd/")
        elif srt_match:
            gallery_item.srt_urls[srt_match.group(1) + '.srt'] = request.url

    # Reset all requests to proxy
    del driver.requests
    driver.request_interceptor = read_requests
    try:
        driver.get(gallery_item.video_url)
        gallery_item.author = driver.find_element(By.CLASS_NAME, "userLink").text
        gallery_item.date_added = driver.find_element(By.CSS_SELECTOR, "#js-entry-create-at > span").text
        play_frame = driver.find_element(By.ID, "kplayer_ifp")
        driver.switch_to.frame(play_frame)
        play_button = driver.find_element(By.CLASS_NAME, "largePlayBtn")
        play_button.click()

        # Open quality options
        settings_button = driver.find_element(By.CSS_SELECTOR, ".sourceSelector > button")
        settings_button.click()

        # Leave time for the player's network requests to reach the
        # interceptor before moving on.
        time.sleep(3)
        gallery_item.processed = True
    finally:
        del driver.request_interceptor
def print_gallery_item(gallery_item):
    """
    Print a debug dump of one gallery item.

    Writes str(gallery_item) followed by its date_added, thumbnail,
    video_url, download_urls and srt_urls, one value per line.
    """
    print(gallery_item)
    for field in ("date_added", "thumbnail", "video_url", "download_urls", "srt_urls"):
        print(getattr(gallery_item, field))
# Visit every video page once, showing progress on a single tqdm bar.
with tqdm(gallery_items, position=0) as progress_bar:
    for item in progress_bar:
        progress_bar.set_description(f"Processing '{item.title}'")
        process_gallery_item(item)

print()
print("All links have finished pre-processing.")
print()
print("Creating the required folders for each lecture and")
print("saving pre-processed metadata...")
with tqdm(gallery_items, position=0) as pbar:
    for gallery_item in pbar:
        pbar.set_description("Allocating '" + gallery_item.title + "'")
        # Create the subfolder
        dl_path = final_dir + "/" + gallery_item.get_folder_name()
        os.makedirs(dl_path)
        gallery_item.download_path = dl_path
        # Persist everything gathered so far next to the future downloads.
        with open(dl_path + "/download.json", 'w') as f:
            f.write(json.dumps(gallery_item.__dict__, indent=4))

# Shut the browser down. quit() ends the whole WebDriver session and its
# driver process; close() only closed the current window and could leave
# the browser/driver running after the script finished with it.
driver.quit()
## STEP 5: Download all the lecture data in parallel
# Number of lectures downloaded at the same time.
NUM_PARALLEL = 5

print()
print(f"Downloading lecture data in parallel (# of streams: {NUM_PARALLEL}).")
print("This process will take very long depending on your internet speed,")
print("but should take less time to finish towards the end.")
def download_lecture(gallery_item):
    """
    Download every video and subtitle file of one lecture.

    Files listed in ``gallery_item.download_urls`` and
    ``gallery_item.srt_urls`` are saved into ``gallery_item.download_path``
    under their mapped file names. Files that already exist are skipped.

    ``gallery_item.downloaded`` is set to True only when every file is
    present afterwards; a non-200 response or a network error is
    reported and leaves it False so the status report stays truthful.

    Returns the same gallery_item so results can be collected by a map.
    """
    dl_path = gallery_item.download_path
    # Videos and subtitles share one "file name -> URL" mapping.
    dl_list = {**gallery_item.download_urls, **gallery_item.srt_urls}
    all_ok = True
    for fname, url in dl_list.items():
        target = dl_path + "/" + fname
        if os.path.exists(target):
            continue
        # Stream into a temporary ".part" file and rename at the end, so
        # an interrupted transfer is never mistaken for a finished file
        # by the exists() check above on a later run.
        partial = target + ".part"
        try:
            # The context manager releases the connection; the timeout
            # keeps a stalled server from blocking a worker forever.
            with requests.get(url, stream=True, timeout=60) as r:
                if r.status_code != 200:
                    print("Encountered unexpected status code", r.status_code, "while downloading", fname)
                    all_ok = False
                    continue
                with open(partial, "wb") as f:
                    # 1 MiB chunks instead of the 128-byte default of
                    # iterating the response object directly.
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(partial, target)
        except requests.RequestException as exc:
            print("Encountered network error", exc, "while downloading", fname)
            all_ok = False
    gallery_item.downloaded = all_ok
    return gallery_item
# Run download_lecture over all items with NUM_PARALLEL worker threads.
next_items = thread_map(download_lecture, gallery_items, max_workers=NUM_PARALLEL)

num_downloaded = sum(1 for item in next_items if item.downloaded)
print()
print("Status Report:")
print("Total videos downloaded", num_downloaded, "out of", len(next_items))
print("Tool has finished execution.")
Loading…
Cancel
Save