"""Collect lecture video (mms) URLs from Stanford's myvideosu site.

The script asks for a SUNet login on the terminal, authenticates through
weblogin.stanford.edu, walks every "GradCourseInfo.aspx" course link on the
landing page, opens each "openslplayer.aspx" lecture link and reads the value
of the ``video_URL`` input.  The result is written to stdout as
tab-separated ``course number``, ``lecture index``, ``url`` lines; all
progress and error messages go to stderr.
"""

import getpass
import sys
import time
import urllib  # module name kept in scope; urlencode is bound below

try:  # Python 2 module names
    import cookielib
    import HTMLParser
    import urllib2
    from urllib import urlencode
except ImportError:  # Python 3 homes of the same functionality
    import html.parser as HTMLParser
    import http.cookiejar as cookielib
    import urllib.request as urllib2
    from urllib.parse import urlencode

BASE_URL = "https://myvideosu.stanford.edu"
LOGIN_URL = "https://weblogin.stanford.edu/login/login"
# Upper bound on how much of a response body is read.
MAX_PAGE_BYTES = 1024 * 1024


def _log(message, newline=True):
    """Write a progress message to stderr, optionally without a newline."""
    sys.stderr.write(message + ("\n" if newline else ""))
    sys.stderr.flush()


def get_page(urlOpener, url, data=None, retries=10, delay=10):
    """Fetch ``url`` through ``urlOpener`` and return the body as text.

    Args:
        urlOpener: object with an ``open(request, data)`` method, such as the
            result of ``urllib2.build_opener``.
        url: address to fetch.
        data: optional url-encoded POST body; text is encoded to bytes.
        retries: number of additional attempts after the first failure.
        delay: seconds to sleep between attempts.

    Returns:
        At most ``MAX_PAGE_BYTES`` of the response body, as ``str``.

    Raises:
        Exception: when every attempt failed; the message names the url and
            the last error.
    """
    if data is not None and not isinstance(data, bytes):
        # Python 3 openers require a bytes body; Python 2 str is already bytes.
        data = data.encode("utf-8")
    request = urllib2.Request(url)
    attempts_left = retries
    while True:
        try:
            # The response gets its own name: the original code rebound
            # ``url`` here, so its failure message always reported "None".
            response = urlOpener.open(request, data)
        except Exception as err:
            if attempts_left <= 0:
                raise Exception("Failed to fetch url %s (%s)" % (url, err))
            attempts_left -= 1
            time.sleep(delay)
        else:
            break
    try:
        page = response.read(MAX_PAGE_BYTES)
    finally:
        response.close()
    if not isinstance(page, str):
        # Python 3 returns bytes, but HTMLParser.feed and the str searches
        # done on the page need text.
        page = page.decode("utf-8", "replace")
    return page


def alist_to_dict(alist):
    """Turn a list of ``(name, value)`` pairs into a dict (last pair wins)."""
    return dict((pair[0], pair[1]) for pair in alist)


class WebLoginRedirectHTMLParser(HTMLParser.HTMLParser):
    """Find the ``href`` of the ``<a id="continue_link">`` element."""

    def __init__(self):
        # Explicit base call: on Python 2 HTMLParser is an old-style class,
        # so super() cannot be used.
        HTMLParser.HTMLParser.__init__(self)
        self.redirect = ""

    def handle_starttag(self, tag, attrs):
        if tag != "a":
            return
        d = alist_to_dict(attrs)
        if d.get("id") != "continue_link":
            return
        href = d.get("href")
        if href:
            # A continue link without a usable href is ignored instead of
            # raising KeyError.
            self.redirect = href

    def get_redirect(self):
        """Return the redirect target, or "" when none was seen."""
        return self.redirect


class WebLoginHTMLParser(HTMLParser.HTMLParser):
    """Collect the hidden ``RT``, ``ST`` and ``LC`` login form fields."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.crud = {"RT": "", "ST": "", "LC": ""}

    def handle_starttag(self, tag, attrs):
        if tag != "input":
            return
        d = alist_to_dict(attrs)
        # Inputs may lack type/name/value; .get avoids the KeyError the
        # direct indexing raised on such elements.
        if (d.get("type") or "").lower() != "hidden":
            return
        name = d.get("name")
        if name in self.crud:
            self.crud[name] = d.get("value") or ""

    def get_hiddens(self):
        """Return the dict of hidden field values ("" when not found)."""
        return self.crud


class HTMLCrudParser(HTMLParser.HTMLParser):
    """Collect the page title, link targets and id-carrying inputs."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.title = ""
        self.on_title = False
        self.urls = []
        self.inputs = []

    def handle_starttag(self, tag, attrs):
        # HTMLParser hands over tag and attribute names already lower-cased,
        # so plain key lookups are sufficient.
        if tag == "a":
            href = alist_to_dict(attrs).get("href")
            if href is not None:
                # Valueless href attributes are skipped so that callers can
                # treat every collected url as a string.
                self.urls.append(href)
        elif tag == "input":
            d = alist_to_dict(attrs)
            if "id" in d:
                self.inputs.append(d)
        elif tag == "title":
            self.on_title = True

    def handle_endtag(self, tag):
        if tag == "title":
            self.on_title = False

    def handle_data(self, data):
        if self.on_title:
            self.title += data.strip()

    def get_urls(self):
        """Return the href values of all ``<a>`` tags, in document order."""
        return self.urls

    def get_inputs(self):
        """Return the attribute dicts of all ``<input>`` tags with an id."""
        return self.inputs

    def get_title(self):
        """Return the stripped text found inside ``<title>``."""
        return self.title


class Course(object):
    """A course with its catalogue number, name and list of lecture URLs."""

    def __init__(self, name, number):
        self.name = name
        self.number = number
        self.lectures = []

    def get_name(self):
        return self.name

    def get_number(self):
        return self.number

    def get_lectures(self):
        return self.lectures

    def add_lecture(self, lecture):
        self.lectures.append(lecture)


def split_title(title):
    """Split a page title at its first space into ``(number, name)``.

    A title without any space is returned whole as the number with an empty
    name (slicing on ``find``'s -1 used to chop off its last character).
    """
    number, _, name = title.partition(" ")
    return number.strip(), name.strip()


def parse_meeting_count(page):
    """Return the integer inside "Course Meetings: (N)", or None if absent."""
    _, marker, rest = page.partition("Course Meetings: (")
    if not marker:
        return None
    try:
        return int(rest.split(")")[0])
    except ValueError:
        return None


def find_video_url(inputs):
    """Return the ``value`` of the last input whose id is ``video_URL``.

    Returns None when no such input exists or it carries no value.
    """
    mms = None
    for attrs in inputs:
        if attrs.get("id") == "video_URL":
            mms = attrs.get("value")
    return mms


def load_course(urlOpener, link):
    """Load one course page and every lecture it links to.

    Args:
        urlOpener: opener carrying the authenticated session cookies.
        link: site-relative link to the course page.

    Returns:
        A Course whose lectures are in the order the page lists them.
    """
    page = get_page(urlOpener, BASE_URL + link)
    course_page = HTMLCrudParser()
    course_page.feed(page)
    number, name = split_title(course_page.get_title())
    _log("---> %s (%s)" % (number, name))
    course = Course(name, number)
    meetings = parse_meeting_count(page)
    novideos = page.count("No video")

    for lecture_link in course_page.get_urls():
        if "openslplayer.aspx" not in lecture_link:
            continue
        if "wmp=true" not in lecture_link:
            continue
        # The player address is the first single-quoted part of the link.
        parts = lecture_link.split("'")
        if len(parts) < 2:
            _log("---> Error: no quoted player url in %s" % (lecture_link))
            continue
        player_url = parts[1]
        player_page = HTMLCrudParser()
        player_page.feed(get_page(urlOpener, player_url))
        mms = find_video_url(player_page.get_inputs())
        if mms is None:
            _log("---> Error: missing mms url for %s" % (player_url))
        # Recorded even when None, so the lecture count compared below still
        # counts this meeting.
        course.add_lecture(mms)

    found = len(course.get_lectures())
    _log("----> %d lectures" % (found))
    if meetings is None:
        _log("-----> WARNING: could not read the meeting count from the page")
    elif (meetings - novideos) != found:
        _log("-----> WARNING: MAY NOT HAVE OBTAINED ALL LECTURES! (page says %d, found %d, no videos %d)" % (meetings, found, novideos))
    return course


def dump_courses(courses, out=None):
    """Write ``number<TAB>index<TAB>url`` lines for every lecture.

    Each course's lectures are written in reverse of their stored order,
    numbered from 0.  The stored lists are left untouched.

    Args:
        courses: iterable of Course objects.
        out: writable text stream; defaults to ``sys.stdout``.
    """
    if out is None:
        out = sys.stdout
    for course in courses:
        number = course.get_number()
        for index, lecture in enumerate(reversed(course.get_lectures())):
            out.write("%s\t%d\t%s\n" % (number, index, lecture))


def main():
    """Prompt for credentials, log in, scrape all courses and dump them."""
    _log("--> SUNET authentication required:")
    _log("---> Login: ", newline=False)
    login = sys.stdin.readline().strip()
    _log("\r", newline=False)
    password = getpass.getpass("---> Password: ", sys.stderr)
    _log("\r", newline=False)

    # set up cookies
    cookies = cookielib.CookieJar()
    urlOpener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies))

    # request myvideosu, which redirects to stanford weblogin
    page = get_page(urlOpener, BASE_URL)
    webloginparser = WebLoginHTMLParser()
    webloginparser.feed(page)
    hiddens = webloginparser.get_hiddens()

    # issue username / password to stanford weblogin
    data = urlencode({
        "username": login,
        "password": password,
        "login": "yes",
        "RT": hiddens["RT"],
        "ST": hiddens["ST"],
        "LC": hiddens["LC"],
    })
    page = get_page(urlOpener, LOGIN_URL, data)
    if not any(cookie.name == "webauth_wpt_krb5" for cookie in cookies):
        _log("--> Login failed.")
        sys.exit(1)
    _log("--> Logged in.")

    # parse the redirect
    redirparser = WebLoginRedirectHTMLParser()
    redirparser.feed(page)
    redirect = redirparser.get_redirect()
    if not redirect:
        _log("--> Error: no continue link found after login.")
        sys.exit(1)

    # parse SCPD's main page
    page = get_page(urlOpener, redirect)
    crud = HTMLCrudParser()
    crud.feed(page)
    _log("--> Loading Quarter \"%s\"" % (crud.get_title()))

    # for each course, load its page to get the lectures
    # for each page of lectures, load each individual meeting to grab the
    # mms url; meetings are in reverse-chronological order
    _log("--> Loading courses...")
    courses = []
    for link in crud.get_urls():
        if "GradCourseInfo.aspx" not in link:
            continue
        courses.append(load_course(urlOpener, link))

    _log("--> Dumping courses...")
    dump_courses(courses)


if __name__ == "__main__":
    main()