有效沟通问答-【官方】百战程序员_IT在线教育培训机构

import requests,re
import json
from requests.api import head
url ="http://api-my.le.com/vcm/api/list?jsonp=jQuery19108692358134849103_1628923285688&type=video&rows=20&page=1&sort=&cid=2&source=1&xid=74513092&pid=10058550&ctype=cmt%2Cimg%2Cvote&listType=1&_=1628923285700"
ua={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
html = requests.get(url,headers=ua).text
print("-"*500)
contents = re.findall(r'"content":"(.*?)"',html)
print(contents,len(contents),sep="\n")
new_contens=[content.encode('utf-8').decode('unicode-escape') for content in contents]
print(new_contens)

老师，为什么unicode字符串不能全部转成中文？查了半天，还是还没搞出来？

怎么处理，麻烦不吝赐教。谢谢。

Python 全系列/第十六阶段：Python 爬虫开发/爬虫反反爬- 631楼

# _*_coding=utf-8 _*_
from fake_useragent import UserAgent
import requests
from lxml import etree
from time import sleep

def get_html(url):
    """
    :param url: 要爬取的url
    :return返回html
    """
    headers = {
        "User-Agent": UserAgent().chrome
    }
    resp = requests.get(url, headers=headers)
    sleep(3)
    if resp.status_code == 200:
        resp.encoding = 'utf-8'
        return resp.text
    else:
        return None

def parse_list(html):
    """
    :param html: 传递进来一个有电影列表的html
    :return 返回一个电影列表的url
    """
    e = etree.HTML(html)
    list_url = ['https://maoyan.com'+ url for url in e.xpath('//div[@class="movie-item-hover"]/a/@href')]
    return list_url

def parse_index(html):
    """
    :param html: 传递进来一个有电影信息的url
    :return  已经提取好的电影信息
    """
    e = etree.HTML(html)
    names = e.xpath('//h1/text()')[0]
    type = e.xpath('//li[@class="ellipsis"]/a/text()')[0]
    actor = e.xpath('//ul[@class="celebrity-list clearfix"]/li[@class="celebrity actor"]/div/a/text()')
    actors = format_actor(actor)
    return {'name': names, 'type': type, 'actor': actors}

def format_actor(actors):
    actor_set = set()  # 去重
    for actor in actors:
        actor_set.add(actor.strip())
    return actor_set

def main():
    num = int(input('请输入要获取多少页数据'))
    for y in range(num):
        url = 'https://maoyan.com/films?showType=3&offset={}'.format(y*30)
        # print(url)
        list_html = get_html(url)
        list_url = parse_list(list_html)
        for url in list_url:
            # print(url)
            info_html = get_html(url)
            movie = parse_index(info_html)
            print(movie)


if __name__ == '__main__':
    main()

老师为啥没有数据啊！

Python 全系列/第十六阶段：Python 爬虫开发/爬虫反反爬- 632楼

from urllib.request import Request,build_opener,urlopen
from fake_useragent import UserAgent
from urllib.parse import urlencode
from urllib.request import HTTPCookieProcessor

login_url="https://www.mypianku.net/user/login/"
form_data={
    "user":"18628906572",
    "password":"x..k.d$2000$"
}
headers={
    "User-Agent":UserAgent().random}
req=Request(login_url,headers=headers,data=urlencode(form_data).encode())
opener=build_opener()
resp=opener.open(req)


#---------------------------登录成功--------------------


url="https://www.mypianku.net/user/account/"

headers={
    "User-Agent":UserAgent().random,
    "Cookie":"_pk_id.1.f469=6d7b76987328fd10.1626832666.; d_c=d_1412928085; vrg_sc=4bb1ca6010ff37986d716442a23afa73; vrg_go=1; pyad=2; player123=%E6%B2%90%E6%B5%B4%E4%B9%8B%E7%8E%8BHD%23%23%2Fpy%2FlNmZxwmZsBTZ_1.html%23%235812%23%2394%24%24%E4%B8%80%E8%B7%AF%E6%83%8A%E5%96%9C%E6%AD%A3%E7%89%87%23%23%2Fpy%2FlNmZoRWMp1WM_1.html%23%23271%23%234%24%24%E4%B8%80%E8%B7%AF%E6%83%8A%E5%96%9C%E9%AB%98%E6%B8%85%23%23%2Fpy%2FlNGbrBTZ2wmY_1.html%23%23477%23%238%24%24%E6%B3%95%E5%8C%BB%E7%A7%A6%E6%98%8E2%E6%B8%85%E9%81%93%E5%A4%AB%E7%AC%AC2%E9%9B%86%23%23%2Fpy%2FlNGbmZjY3YmN_2.html%23%231479%23%2368%24%24%E6%B3%95%E5%8C%BB%E7%A7%A6%E6%98%8E%E7%AC%AC03%E9%9B%86%23%23%2Fpy%2FlRmNkdmZsRmN_3.html%23%23837%23%2342%24%24%E4%BA%BA%E5%86%8D%E5%9B%A7%E9%80%94%E4%B9%8B%E6%B3%B0%E5%9B%A7%E6%AD%A3%E7%89%87%23%23%2Fpy%2FlNmZkRjYjlDa_1.html%23%231206%23%2319%24%24%E9%80%9F%E5%BA%A6%E4%B8%8E%E6%BF%80%E6%83%853%EF%BC%9A%E4%B8%9C%E4%BA%AC%E6%BC%82%E7%A7%BBHD1280%E9%AB%98%E6%B8%85%E4%B8%AD%E5%AD%97%E7%89%88%23%23%2Fpy%2FlNWZ5wWb2ADb_1.html%23%23783%23%2313%24%24%E9%80%9F%E5%BA%A6%E4%B8%8E%E6%BF%80%E6%83%856HD1280%E9%AB%98%E6%B8%85%E4%B8%AD%E8%8B%B1%E5%8F%8C%E5%AD%97%E7%89%88%23%23%2Fpy%2FlNWZ5wWb2MzZ_1.html%23%235487%23%2370%24%24%E7%88%B1%E6%83%85%E5%85%AC%E5%AF%93%E7%AC%AC02%E9%9B%86%23%23%2Fpy%2FlRWZ3kGatNDZ_2.html%23%235%23%230%24%24; _pk_ref.1.f469=%5B%22%22%2C%22%22%2C1628861961%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_ses.1.f469=1; PHPSESSID=iihse8vlb216gg5fdg3gp1enr2; Pianku_auth=06c5N-Pw7RaPoL7AOK2gZ9aIXMWGJb9xfbzdsIoHXUhn5Z-bGs1l68_Lhs7og6jww6iG_WLRbyEnhRXTbu_vthptMBgPXAm5yYV9rJlFKV2fnCs086hqg2uotFTErgHhyApWJTPsdVY19PwZJf_HwhyE7FcC83swIfUitbx_hsbqF2XVCp-zj5IU12U; Pianku_cookietime=95b9iaLnS9KECMWRwUf-834BhwpxfnYShmmwOys_Yp0DjXcSV_C1"
}
request=Request(url,headers=headers)
response=urlopen(request)
print(response.read().decode())

老师，请问导入下面这行代码是什么意思呢?
from urllib.request import HTTPCookieProcessor

Python 全系列/第十六阶段：Python 爬虫开发/scrapy框架使用（旧） 633楼

老师，pip install fake-useragent命令是下面这种情况

Python 全系列/第十六阶段：Python 爬虫开发/scrapy框架使用（旧） 634楼

使用docker toolbox安装docker，请问老师报这个错应该怎么处理

yuyyuuuyuyu

Python 全系列/第十六阶段：Python 爬虫开发/动态数据抓取 635楼

from urllib.request import Request,urlopen
# import ssl
url="https://www.12306.cn/index/"
headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67"
    }

# context=ssl.create_default_context()
request=Request(url,headers=headers)
response=urlopen(request)
print(response.read().decode())


老师，我这里无论用不用ssl他都可以直接访问12306的网站啊，请问这是我UA的问题嘛还是什么？

Python 全系列/第十六阶段：Python 爬虫开发/爬虫基础（旧） 636楼

from urllib.request import urlopen,Request
from urllib.parse import quote
def get_html(url):
    headers={
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67"
    }
    request=Request(url,headers=headers)
    response=urlopen(request)
    return response.read().decode()
def save_html(html,filename):
    with open(filename,'w',encoding='utf-8') as f:
        f.write(html)

def main():
    content = input("请输入要获取的贴吧:")
    num = int(input("请输入要获取的页:"))

    for i in range(num):
        url="https://tieba.baidu.com/f?kw=' + quote(content) + '&ie=utf-8&pn={}".format(i*50)
        print(url)
        html=get_html(url)
        filename = '第' + str(i + 1) + '页.html'
        save_html(html,filename)
    # url = "https://tieba.baidu.com/f?kw=%E5%B0%9A%E5%AD%A6%E5%A0%82&ie=utf-8&pn=50"
    # html = get_html(url)
    # save_html(html)
    # url = "https://tieba.baidu.com/f?kw=%E5%B0%9A%E5%AD%A6%E5%A0%82&ie=utf-8&pn=100"
    # html = get_html(url)
    # save_html(html)

if __name__ == '__main__':
    main()
    
    
 老师，请问一下这个运行出来就只有下面那一行成功了，后面就报错是怎么回事儿呀?麻烦老师帮我看一下

Python 全系列/第十六阶段：Python 爬虫开发/爬虫基础（旧） 637楼

半夜突然醒来，闲着无聊敲了个爬虫的代码，用selenium实现一个自动登录之类的，代码如下所示：

"""
   用 selenium 实现对中国大学mocc的登录
   程序运行报错：正在处理中
"""
from selenium import webdriver
from time import sleep
fox = webdriver.Firefox()
url ='https://www.icourse163.org/member/login.htm?returnUrl=aHR0cHM6Ly93d3cuaWNvdXJzZTE2My5vcmcvaW5kZXguaHRt#/webLoginIndex'
fox.get(url)
sleep(3)

# 点击登录按钮,弹出登录界面
fox.find_element_by_css_selector('#auto-id-1628452551743').click()
sleep(1)
# 选择其他登录方式
fox.find_element_by_css_selector('#login-cnt > div > div > div > div.ux-login-set-scan-code_ft > span').click()
# 获取账号框和密码框，输入密码
fox.find_element_by_css_selector('#auto-id-1628452775609').send_keys('*************')
sleep(1)
fox.find_element_by_css_selector('#auto-id-1628452775612').send_keys('*************')
sleep(1)
fox.find_element_by_css_selector('#dologin').click()
sleep(1)

print(fox.current_url)
print(fox.page_source)
sleep(5)


sleep(2)
fox.quit()

报错信息如下：

C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe D:/pythonProject2/实战python网络爬虫/selenium的使用/selenium_03.py

Traceback (most recent call last):

File "D:\pythonProject2\实战python网络爬虫\selenium的使用\selenium_03.py", line 13, in <module>

fox.find_element_by_css_selector('#auto-id-1628452551743').click()

File "C:\Users\Administrator\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 598, in find_element_by_css_selector

return self.find_element(by=By.CSS_SELECTOR, value=css_selector)

File "C:\Users\Administrator\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 976, in find_element

return self.execute(Command.FIND_ELEMENT, {

File "C:\Users\Administrator\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute

self.error_handler.check_response(response)

File "C:\Users\Administrator\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response

raise exception_class(message, screen, stacktrace)

selenium.common.exceptions.NoSuchElementException: Message: Unable to locate element: #auto-id-1628452551743

无法定位到元素，敢问老师怎么处理？

进程已结束，退出代码为 1

Python 全系列/第十六阶段：Python 爬虫开发/爬虫反反爬- 638楼

请问他这个怎么不变呀？

from urllib.request import urlopen
from urllib.request import Request

url = 'http://www.baidu.com/'

# 定义User-Agent变量
headers = {
    "User-Agent":" Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67"
}
# 封装request对象
request = Request(url, headers=headers)
# 发送请求
response = urlopen(url)

print(response.read().decode())

Python 全系列/第十六阶段：Python 爬虫开发/爬虫基础（旧） 639楼

老师，我这个是什么情况，在设置证书的时候出现这个，怎么解决

Python 全系列/第十六阶段：Python 爬虫开发/爬虫基础（旧） 640楼

我登陆的网站参数有点多，如何解决，需要解决这个验证码，然后excution是什么？

Python 全系列/第十六阶段：Python 爬虫开发/爬虫基础（旧） 641楼

from urllib.request import Request, build_opener, ProxyHandler
from fake_useragent import UserAgent


url = 'http://httpbin.org/get'

headers = {"User-Agent": UserAgent().chrome}
req = Request(url, headers=headers)
#
handler = ProxyHandler({'http': '103.45.147.157:16817'})

opener = build_opener(handler)
resp = opener.open(req)
print(resp.read().decode())

老师我这个proxy无论写什么都是返回主机的IP 空字符串也是本机IP 为什么？？

Python 全系列/第十六阶段：Python 爬虫开发/scrapy框架使用（旧） 642楼

"""
    爬取淘宝商品价格
"""
import requests
from lxml import etree


def get_html_text(url):
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'

    }
    try:

        resp = requests.get(url, headers=ua)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        return resp.text
    except BaseException as e:
        print(e)


def parser_page(html):
    e = etree.HTML(html)
    prices = e.xpath("//div[@class='price g_price g_price-highlight']/strong/text()")
    product_infos = e.xpath("//div[@class='row row-2 title']/a/text()")
    for price, info in zip(prices, product_infos):
        print(f"商品:{info.strip()}\n价格:{price}\n\n\n")


def main():
    url ='https://s.taobao.com/search?q=python书籍&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_2'
    html = get_html_text(url)
    parser_page(html)

main()

老师，我这里有一个爬取商品价格的爬虫，运行后显示进程结束，什么都没有，我是被反爬了吗？

Python 全系列/第十六阶段：Python 爬虫开发/爬虫反反爬- 643楼

fiddler是要钱的吗？有没有免费的呀

Python 全系列/第十六阶段：Python 爬虫开发/爬虫基础（旧） 644楼

老师，学到什么程度可以当爬虫工程师呢？需要达到什么样的水准？

Python 全系列/第十六阶段：Python 爬虫开发/移动端爬虫开发- 645楼

同学您好