中国大学Mooc平台,自动下载pdf文档
生活随笔
收集整理的這篇文章主要介紹了
中国大学Mooc平台,自动下载pdf文档
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
寫在前面的話
基於多線程下載pdf文檔的腳本,視頻的下載思路也差不多,自己去提取信息即可。我覺得老師不一定講得好(講得好不好很大程度上取決於主觀因素,而且還與學習者自身水平相關),但至少其中很多是國家精品ppt,還是非常值得去看看的!
實現只用到requests這一個第三方庫,需要手動復制cookie和tid兩個字段信息,其他沒有什麼特殊的地方!
import functools
import re
import threading
import time
from urllib.parse import unquote, urlsplit

import requests

# One session is shared by every request the script makes.
req = requests.session()
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
    'content-type': 'text/plain',
}

# Module-level state filled in by the set_* functions below.
content_id = []  # contentId values parsed from the course-info response
_id = []         # id values parsed from the same response
pdf_url = []     # PDF URLs collected by set_pdf_url()
threads = []     # download threads started by start_download()

# Number of bytes read from the network per write when saving a PDF.
_CHUNK_SIZE = 64 * 1024


def request_mooc_get(url, stream=False):
    """GET *url* through the shared session and return the response.

    Args:
        url: Address to fetch.
        stream: Forwarded to requests; when true the body is fetched lazily
            as it is read instead of being loaded up front.
    """
    return req.get(url=url, stream=stream)


def request_mooc_post(url, data):
    """POST *data* to *url* with the module headers and the stored cookie."""
    return req.post(headers=headers, cookies=cookies, url=url, data=data)


def get_timestamp():
    """Return the current Unix time in milliseconds as an int."""
    return int(round(time.time() * 1000))


def get_session_id(cookie: str):
    """Extract the NTESSTUDYSI value from a raw cookie string.

    Example input fragment: ``NTESSTUDYSI=06f09fb95c634665aeeac99799a919fa;``

    The value ends at the first ``;`` or whitespace, or at the end of the
    string, so it is found whether or not other cookies follow it and
    whether or not the cookies are separated by spaces.

    Raises:
        Exception: if the cookie string has no NTESSTUDYSI entry.
    """
    found = re.search(r'NTESSTUDYSI=([^;\s]+)', cookie)
    if found is None:
        raise Exception('cookie error!')
    return found.group(1)


# The tid appears in the course URL, e.g. tid=1206878228
def get_course_id():
    """Return the course tid stored by set_course_id(), as a string."""
    return str(_tid)


def set_course_id(tid):
    """Store the course tid in the module-level ``_tid``."""
    global _tid
    _tid = tid


def get_cookie():
    """Return the raw cookie string stored by set_cookie()."""
    return cookies['cookie']


def set_cookie(cookie):
    """Store the raw cookie string in the module-level ``cookies`` dict.

    NOTE(review): the whole browser cookie string is stored under a single
    key named ``cookie`` and passed to requests as ``cookies=``; confirm
    this is what the server expects.
    """
    global cookies
    cookies = dict(cookie=cookie)


def set_pdf_url():
    """Query each lesson unit and append every PDF URL found to ``pdf_url``.

    Pairs ``content_id`` with ``_id`` position by position (stopping at the
    shorter list), posts one getLessonUnitLearnVo request per pair and looks
    for a ``textOrigUrl`` ending in ``.pdf`` in the response text. Units
    without such a URL are skipped.
    """
    url = 'https://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr'
    print(len(content_id))
    for unit_content_id, unit_id in zip(content_id, _id):
        print(unit_content_id + " " + unit_id)
        data = {
            'callCount': 1,
            'scriptSessionId': '${scriptSessionId}190',
            'httpSessionId': get_session_id(cookies['cookie']),
            'c0-scriptName': 'CourseBean',
            'c0-methodName': 'getLessonUnitLearnVo',
            'c0-id': 0,
            'c0-param0': 'number:' + unit_content_id,  # content_id
            'c0-param1': 'number:3',
            'c0-param2': 'number:0',
            'c0-param3': 'number:' + unit_id,  # section_id, e.g. 1245454394
            'batchId': get_timestamp(),
        }
        res = req.post(url=url, cookies=cookies, data=data, headers=headers)
        found = re.findall(r'textOrigUrl:"(.*)\.pdf"', res.text)
        if not found:
            continue
        # The pattern captures the URL without its trailing ".pdf"; restore it.
        full_url = found[0] + '.pdf'
        pdf_url.append(full_url)
        print('get ' + get_file_name(unquote(full_url)))


def get_course_info():
    """Fetch the course term description for the stored tid.

    Returns:
        The response of the getLastLearnedMocTermDto request; main() parses
        its text for content ids and unit ids.
    """
    url = 'https://www.icourse163.org/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr'
    data = {
        "callCount": "1",
        "scriptSessionId": "${scriptSessionId}190",
        "httpSessionId": get_session_id(cookies['cookie']),
        "c0-scriptName": "CourseBean",
        "c0-methodName": "getLastLearnedMocTermDto",
        "c0-id": "0",
        "c0-param0": "number:" + get_course_id(),
        "batchId": get_timestamp(),
    }
    return request_mooc_post(url=url, data=data)


def set_content_id(content: str):
    """Parse every ``contentId=<digits>;`` from *content* into ``content_id``."""
    result = re.findall(r'contentId=(\d+);', content)
    print(result)
    global content_id
    content_id = result


def set_id(context: str):
    """Parse the unit ids from *context* into ``_id``.

    Only an ``id=<digits>;`` that is followed on the same line by text
    containing ``jsonContent`` is taken.
    """
    result = re.findall(r'id=(\d+);s.*jsonContent', context)
    print(result)
    global _id
    _id = result


def _time(func):
    """Decorator that prints how long *func* took and returns its result."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print('Took ' + str(end - start))
        return result
    return wrapper


@_time
def start_download():
    """Download every URL in ``pdf_url``, one thread per file, and wait."""
    for url in pdf_url:
        t = threading.Thread(target=start_write, args=(url,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
    print("ok!")


def start_write(url):
    """Download *url* and save it in the current directory.

    The file name comes from get_file_name() applied to the unquoted URL.
    The body is streamed to disk in chunks, and a response with an HTTP
    error status is reported and skipped instead of being saved as a PDF.
    """
    file_name = get_file_name(unquote(url))
    print(file_name + ' downloading...')
    with request_mooc_get(url, stream=True) as result:
        if not result.ok:
            print(file_name + ' failed: HTTP ' + str(result.status_code))
            return
        with open(file_name, 'wb') as f:
            for chunk in result.iter_content(_CHUNK_SIZE):
                f.write(chunk)


def get_file_name(url):
    """Return the file name to use for a PDF *url*.

    The text after ``&download=`` is used when present. Otherwise the last
    segment of the URL path is used, and ``download.pdf`` if that is empty,
    so one URL without the parameter cannot abort the whole run.
    """
    found = re.search(r'&download=(.*)', url)
    if found:
        return found.group(1)
    return urlsplit(url).path.rsplit('/', 1)[-1] or 'download.pdf'


def main(tid, cookie):
    """Collect and download every PDF of the course identified by *tid*.

    Args:
        tid: Course term id, copied from the course URL.
        cookie: Full cookie request header copied from the browser.
    """
    set_course_id(tid)
    set_cookie(cookie)
    context = get_course_info().text
    set_content_id(context)
    set_id(context)
    set_pdf_url()
    print(len(pdf_url))
    start_download()


if __name__ == '__main__':
    # tid: copy it straight from the course URL.
    # cookie: press F12 and copy the complete cookie from the request
    # headers, then paste it below.
    tid = ''
    cookie = ''
    if not tid or not cookie:
        raise SystemExit('Set tid and cookie at the bottom of the script before running it.')
    main(tid, cookie)

# Original article text following the listing: "Finally, here is a screenshot."
總結
以上是生活随笔為你收集整理的中国大学Mooc平台,自动下载pdf文档的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: VS建lUML画类图
- 下一篇: 内核链表list.h文件剖析