生活随笔
收集整理的這篇文章主要介紹了
Python批量下载MOOC课件
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
今天干了件有點快樂的事情——批量下載MOOC課件
代碼搬運,需做以下修改
1.pip install selenium
2.chromedriver下載7.2可
3.修改courseware_url
import os
import re
import time

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Wildcard import kept from the original script; nothing visible here relies on it.
from selenium.common.exceptions import *  # noqa: F401,F403

# download() calls requests with verify=False, so silence the resulting warning.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
requests.adapters.DEFAULT_RETRIES = 5

# One shared browser session and a 10-second explicit wait, used by get_courseware().
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)


def download(url, file_name):
    """Download ``url`` to ``file_name`` unless a usable copy already exists.

    An existing file larger than 10 bytes is considered complete and is left
    untouched; anything smaller is treated as a failed earlier attempt and is
    fetched again.

    The response is fetched *before* the target file is opened, and the file
    is only written when the server answers with a success status, so a failed
    request no longer leaves an empty or error-page file behind.

    Args:
        url: Address of the document to fetch.
        file_name: Path of the file to write.

    Returns:
        None. Progress is reported on stdout.
    """
    if os.path.exists(file_name) and os.path.getsize(file_name) > 10:
        print("\t文件已存在:{}".format(file_name))
        return

    # NOTE(review): Host is fixed to hubble.netease.com regardless of the host
    # in ``url`` - kept as in the original; confirm the server expects this.
    headers = {
        'Host': 'hubble.netease.com',
        'Origin': 'https://www.icourse163.org',
        'Referer': url.split("#")[0],
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
    }
    r = requests.get(url, headers=headers, verify=False)
    if not r.ok:
        # Report and skip instead of saving an error page as courseware.
        print("\t下載失敗({}):{}".format(r.status_code, file_name))
        return

    with open(file_name, "wb") as f:
        f.write(r.content)
    print("\t下載成功:{}".format(file_name))
# Selectors for the chapter list of the course content page.
_CHAPTER_TITLES = "div > div.m-learnChapterList> div.m-learnChapterNormal > div.titleBox > h3"
_CHAPTER_NTH = "div > div.m-learnChapterList> div.m-learnChapterNormal:nth-child({}) > div.titleBox > h3"

# Prefix of a chapter title up to and including its week/chapter marker.
_WEEK_PATTERN = re.compile(r'.*?第(.{1,3})(周|章).*?')

# NOTE(review): the Chinese literals used below to locate page elements
# ('文檔講稿', '文檔', '文檔下載', '課件:') must match the text the site actually
# serves - confirm against the live page before relying on them.


def _safe_name(text):
    """Replace path-hostile characters in ``text`` (same chain as the original)."""
    return (text.replace(":", "-")
                .replace("/", " ")
                .replace("\\", " ")
                .replace("課件:", " ")
                .replace(":", " "))


def _week_label(chapter_title):
    """Return the leading week/chapter part of ``chapter_title``, or the whole title."""
    match = _WEEK_PATTERN.match(chapter_title)
    return match.group(0) if match else chapter_title


def _target_path(directory, week, title, download_url):
    """Build the output path: ``<directory>/<week> <title without spaces>.<ext>``."""
    # The extension is whatever follows the last dot of the URL, up to any '&'.
    extension = download_url.split(".")[-1].split('&')[0]
    compact_title = "".join(title.split()).replace(":", " ")
    return os.path.join(directory, week + " " + compact_title + "." + extension)


def _wait_clickable(by, value):
    """Wait for the first element matching the locator to become clickable."""
    return wait.until(EC.element_to_be_clickable((by, value)))


def _wait_nth_clickable(css, position):
    """Wait for the ``position``-th (0-based) element matching ``css`` to be clickable."""
    from selenium.common.exceptions import StaleElementReferenceException

    def _locate(browser):
        matches = browser.find_elements(By.CSS_SELECTOR, css)
        if position >= len(matches):
            return False
        try:
            candidate = matches[position]
            if candidate.is_displayed() and candidate.is_enabled():
                return candidate
        except StaleElementReferenceException:
            # The list was re-rendered while we looked at it; poll again.
            pass
        return False

    return wait.until(_locate)


def _reopen_chapter(index):
    """Click the chapter at nth-child(3), then the chapter at ``index``."""
    _wait_clickable(By.CSS_SELECTOR, _CHAPTER_NTH.format(3)).click()
    _wait_clickable(By.CSS_SELECTOR, _CHAPTER_NTH.format(index)).click()


def _download_lesson_documents(lesson_index, week, target_dir, chapter_index):
    """Download every document attached to one lesson of the open chapter."""
    lesson = _wait_clickable(
        By.CSS_SELECTOR,
        'div.u-learnLesson:nth-of-type({}) > h4.j-name'.format(lesson_index))
    lesson_name = lesson.text
    doc_css = f'div.u-learnLesson:nth-of-type({lesson_index}) > div.sourceList > div[title^="文檔"]'
    doc_total = len(driver.find_elements(By.CSS_SELECTOR, doc_css))

    for doc_index in range(doc_total):
        # Pick the doc_index-th document; always taking the first match made
        # every pass fetch the same file. Elements are looked up again on each
        # pass because the page is left and re-entered below.
        doc = _wait_nth_clickable(doc_css, doc_index)
        doc_title = doc.get_attribute("title")
        doc.click()
        time.sleep(0.2)

        link = _wait_clickable(By.PARTIAL_LINK_TEXT, '文檔下載')
        download_url = link.get_attribute("href")

        title = _safe_name(f'{lesson_name} {doc_title}')
        print(week, " ", title)
        file_name = _target_path(target_dir, week, title, download_url)
        print(file_name)
        download(download_url, file_name)

        # Return to the chapter list and re-open the chapter being processed.
        driver.back()
        time.sleep(1)
        _reopen_chapter(chapter_index)


def _download_course(courseware_url, path, c_range):
    """Run one full pass over the selected chapters of the course."""
    driver.get(courseware_url)
    heading = _wait_clickable(
        By.CSS_SELECTOR,
        "#g-body > div.m-learnhead > div > div > div > a.f-fl > h4")
    school_name = re.findall(r'/([a-zA-Z]+)-', courseware_url)[0]

    # Use a local name so that a retry starts again from the caller's ``path``.
    course_dir = os.path.join(path, heading.text + "_" + school_name)
    target_dir = os.path.join(course_dir, "courseware")
    os.makedirs(target_dir, exist_ok=True)

    chapter_total = len(driver.find_elements(By.CSS_SELECTOR, _CHAPTER_TITLES))
    last = chapter_total if c_range[1] == 0 else c_range[1]

    # Chapter positions are offset by 3 in the nth-child() selector.
    for index in range(3 + c_range[0], 3 + last):
        driver.refresh()
        _wait_clickable(By.CSS_SELECTOR, _CHAPTER_NTH.format(3)).click()
        chapter = _wait_clickable(By.CSS_SELECTOR, _CHAPTER_NTH.format(index))
        chapter_title = chapter.text
        print("{}:".format(chapter_title), end="\t")
        week = _safe_name(_week_label(chapter_title))
        chapter.click()
        time.sleep(3)

        print(len(driver.find_elements(
            By.XPATH, '//div[@class="sourceList"]/*[@title="文檔講稿"]')))
        lesson_total = len(driver.find_elements(
            By.CSS_SELECTOR, 'div.u-learnLesson > h4'))
        for lesson_index in range(1, lesson_total + 1):
            _download_lesson_documents(lesson_index, week, target_dir, index)


def get_courseware(courseware_url, path, c_range=(0, 0)):
    """Download the document courseware of a course into ``path``.

    Files are stored under ``<path>/<course title>_<school>/courseware``. The
    school is the run of letters between a ``/`` and a ``-`` in
    ``courseware_url``. The whole pass is attempted at most twice; a second
    attempt happens only after a ``FileNotFoundError``.

    Args:
        courseware_url: URL of the course content page.
        path: Base directory; created if missing.
        c_range: ``[start, stop]`` chapter offsets. ``stop == 0`` means "up to
            the number of chapters found on the page".

    Raises:
        IndexError: if no school name can be extracted from ``courseware_url``.
        Selenium wait timeouts and other errors propagate to the caller.
    """
    for _attempt in range(2):
        try:
            _download_course(courseware_url, path, c_range)
        except FileNotFoundError:
            print("FileNotFoundError: [Errno 2] No such file or directory: ")
        else:
            break


def main():
    """Download the courseware of the configured course, then close the browser."""
    courseware_url = 'https://www.icourse163.org/learn/XDU-1001638014?tid=1462808447#/learn/content'
    path = r"D:\大二下\信號與系統\中國大學MOOC"
    try:
        get_courseware(courseware_url, path, [0, 0])
    finally:
        # Close the browser even when the run aborts with an exception.
        driver.quit()


if __name__ == '__main__':
    main()
question:
1.每節只能下一個課件,應該有多少下多少
2.命名格式調一下更好,放假了來改進
總結
以上是生活随笔為你收集整理的Python批量下载MOOC课件的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。