之前用requests爬取猫眼电影时,
由于猫眼把电影信息变成了动态获取,因此无法直接用源代码爬取。
现在通过selenium直接点进电影信息时发现,
即使首页面加了防检测,也只能在当前页面生效,新打开的页面中window.navigator.webdriver === true,依旧无法获取电影信息,因此仍无法只用selenium获取多条电影信息。
因此通过requests爬取电影目录,selenium爬取电影信息,总算是成功爬取了电影信息,目前猫眼评分被转码的问题仍未解决
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep
import requests
from fake_useragent import UserAgent
from lxml import etree
from queue import Queue
from threading import Thread
def get_film_list(page):
    """Collect film detail-page links from the Maoyan listing pages.

    Args:
        page: Number of listing pages to fetch. Page ``i`` (0-based) is
            requested with ``offset=30*i``.

    Returns:
        A list with every ``href`` matched by the listing XPath, in the
        order the pages were fetched. Empty when ``page`` is 0 or nothing
        matched.
    """
    film_list = []
    # Build the UserAgent helper once; its ``chrome`` attribute is still read
    # for every request, exactly as before.
    user_agent = UserAgent()
    for i in range(page):
        url = f'https://www.maoyan.com/films/?showType=3&offset={30*i}'
        headers = {'User-Agent': user_agent.chrome}
        # Without a timeout requests waits forever on a stalled connection,
        # which would hang the whole script.
        resp = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(resp.text)
        film_list.extend(html.xpath('//dd/div[@title]/a/@href'))
        # Pause between listing pages to keep the request rate low.
        sleep(1)
    return film_list
# Script injected into every new document so that ``navigator.webdriver``
# reads as false before any page script runs.
_HIDE_WEBDRIVER_JS = """
Object.defineProperty(navigator, 'webdriver', {
get: () => false
})
"""


def get_film_data(film):
    """Scrape the title, genres and actors of one film with headless Chrome.

    A fresh Chrome instance is started for every call and is always shut
    down before returning, even when an element lookup fails.

    Args:
        film: Site-relative path of the film detail page; it is appended to
            ``https://www.maoyan.com``.

    Returns:
        A dict with the keys ``'电影名'`` (title text), ``'类型'`` (list of
        genre link texts) and ``'主演'`` (de-duplicated list of actor names
        in page order).
    """
    url = f'https://www.maoyan.com{film}'
    options = webdriver.ChromeOptions()
    # Run without a visible browser window.
    options.add_argument('--headless')
    # Anti-detection step 1: drop the automation switch and extension.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    service = Service(executable_path='./tools/chromedriver')
    chrome = webdriver.Chrome(service=service, options=options)
    try:
        chrome.implicitly_wait(2)
        # Anti-detection step 2: mask navigator.webdriver on every new document.
        chrome.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": _HIDE_WEBDRIVER_JS},
        )
        chrome.get(url)
        name = chrome.find_element(By.XPATH, '//h1[@class="name"]').text
        type_ = [
            link.text
            for link in chrome.find_elements(By.XPATH, '//a[@class="text-link"]')
        ]
        # Clicks the first tab whose class is exactly "tab-title " (trailing
        # space); presumably this switches to the cast tab -- TODO confirm.
        chrome.find_element(By.XPATH, '//div[@class="tab-title "]').click()
        actors = []
        for link in chrome.find_elements(
            By.XPATH, '//li[@class="celebrity actor"]/div/a'
        ):
            # Read and normalise the text once: every ``.text`` access is a
            # WebDriver round trip, and the duplicate check must compare the
            # same stripped form that gets stored.
            actor = link.text.strip()
            if actor and actor not in actors:
                actors.append(actor)
        return {'电影名': name, '类型': type_, '主演': actors}
    finally:
        # Always release the browser and chromedriver processes; otherwise
        # every scraped film leaks one Chrome instance.
        chrome.quit()
def create_quere(list):
    """Build a FIFO queue pre-loaded with the given items.

    Args:
        list: Iterable of items to enqueue, in order.

    Returns:
        A new unbounded ``Queue`` holding every item in iteration order.
    """
    pending = Queue()
    for entry in list:
        pending.put(entry)
    return pending
class MyThread(Thread):
    """Worker thread that drains a shared queue of film paths.

    Each item taken from the queue is passed to ``get_film_data`` and the
    resulting info is printed. The thread exits once the queue is empty.
    """

    def __init__(self, q):
        """Store the shared work queue.

        Args:
            q: Queue of film paths shared by all workers.
        """
        Thread.__init__(self)
        self.__q = q

    def run(self) -> None:
        """Process films until the queue has no more items."""
        from queue import Empty

        while True:
            # Take the item without blocking. Checking empty() and then
            # calling a blocking get() is racy: another worker can grab the
            # last item in between, leaving this thread blocked forever.
            try:
                film = self.__q.get_nowait()
            except Empty:
                break
            film_info = get_film_data(film)
            print(film_info)
if __name__ == '__main__':
    # Fetch the links from one listing page, queue them, and let three
    # worker threads share that single queue.
    film_list = get_film_list(1)
    q = create_quere(film_list)
    for _ in range(3):
        MyThread(q).start()