有效沟通问答-【官方】百战程序员_IT在线教育培训机构

def parse_info(self, response):
    total_price = response.xpath('concat(//span[@class="total"]/text(),//span[@class="unit"]/span/text())').extract_first()
    community_name = response.xpath('//div[@class="communityName"]/a[@class="info"]/text()').extract_first()
    area_name = response.xpath('string(//div[@class="areaName"]/span[2])').extract_first()

老师为什么我获得response之后用xpath提取信息，能够正确获得总价total_price，但是小区名称就一直返回None，xpath表达式没有错，在浏览器上用xpath helper也能提取出来，就是pycharm调试的时候不行

Python全系列/第十六阶段：Python 爬虫开发/动态数据抓取 856楼

老师，这里爬到一些数据像是转码过了，有什么办法解码吗

Python全系列/第十六阶段：Python 爬虫开发/爬虫基础（旧） 857楼

from fake_useragent import UserAgent
import requests
from lxml import etree  #解析库


def get_html(url):
    '''
    param url:要爬取的地址
    return:返回html
    '''

    headers={"User-Agent":UserAgent().chrome}
    resp=requests.get(url,headers=headers) #发送请求
    if resp.status_code ==200:
        resp.encoding='utf-8'  #设置字符集
        return resp.text
    else:
        return None

def parse_list(html):
    '''
    param url:传递进来一个有电影列表的html
    return:返回一个电影列表的url
    '''
    # if html:    #html在有值的情况下在做解析
    e=etree.HTML(html)
    list_url=['http://maoyan.com{}'.format(url) for url in e.xpath('//div[@class="movie-item-hover"]/a/@href')]
    return list_url

def pares_index(html):
    '''
    param url:传递进来一个有电影具体信息的html
    return:提取好的电影具体信息
    '''
    e = etree.HTML(html)
    name = e.xpath('//h1[@class="name"]/text()')
    type = e.xpath('//li[@class="ellipsis"][1]/a/text()')
    content = e.xpath('//span[@class="dra"]/text()')

    return {"name":name,"type":type,"content":content}

#     actors=e.xpath('')
#     actors=format_data(actors)

# 有重复的演员名字，需要去重
# def format_data(actors):
#     actor_set=set()
#     for actor in actors:
#         actor_set.add(actor.strip())  #strip去空格
#         return actor_set

def main():

    '''控制上述方法的实施，分配相应的url'''
    num = int(input('请输入多少页:'))
    for page in range(num):
        url="https://maoyan.com/films?showType=1&offset={}".format(page*30)

        list_html=get_html(url)  #发送请求
        list_url=parse_list(list_html)  #解析list_html,返回list_url每个电影的信息
        for url in list_url:
            info_html=get_html(url)
            movie = pares_index(info_html)
            print(movie)

if __name__ =='__main__':
    main()

老师，这个代码只运行到请输入多少页，然后就运行结束了。哪里出错了？

Python全系列/第十六阶段：Python 爬虫开发/爬虫反反爬- 858楼

老师我的代码是按照视频里面敲的，但是运行的时候就会出错，只可以爬取几条数据，麻烦老师帮我看下子

from selenium import webdriver
from time import sleep
from lxml import etree

# 构造浏览器
chrome = webdriver.Chrome()
# 发送请求
url = 'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC&pvid=ce58b28b72ae48c190c8451125b8f894'
chrome.get(url)

# 拉动滚动条到底部，注意的是代码应该写在获取网页源码之前
js = 'document.documentElement.scrollTop=100000'
chrome.execute_script(js)
sleep(3)

html = chrome.page_source
e = etree.HTML(html)
# titles = e.xpath('//div[@class="p-name p-name-type-2"]/a/em/text()')
titles = e.xpath('//div[@id="J_goodsList"]//div[@class="p-name p-name-type-2"]/a/@title')
prices = e.xpath('//div[@id="J_goodsList"]//div[@class="p-price"]/strong/i/text()')

for title,price in zip(titles,prices):
    print(title,':',price)
print(len(price))
chrome.quit()

Python全系列/第十六阶段：Python 爬虫开发/爬虫反反爬- 859楼

class ZolSpider(scrapy.Spider):
    name = 'zol'
    allowed_domains = ['faloo.com']
    start_urls = ['https://b.faloo.com/1216400_1.html']

    def parse(self, response):
        name = response.xpath('//h1/text()').extract_first()

        content = response.xpath('//div[@class="noveContent"]/p/text()').extract()

        next_url = response.xpath('//*[@id="next_page"]/@href').get()

        yield {
            'name':name,
            'content':content
        }
       
        yield scrapy.Request('https:'+next_url,callback=self.parse)

老师我有两个问题 parse这个方法是只要有url无限传递那他就会无限爬取吗类似于爬虫加个for循环吗

这样只有在没有url了以后才会停止爬取那要是想控制爬取的次数或者说达到一定条件停止爬取那这样应该怎么控制呢 yield有能像for循环那样能跳出语句的操作吗

Python全系列/第十六阶段：Python 爬虫开发/scrapy框架使用 860楼

from fake_useragent import UserAgent
import requests
from pyquery import PyQuery as pq


url = 'https://www.qidian.com/finish'
headers = {'User-Agent':UserAgent().random}

resp = requests.get(url,headers=headers)

doc = pq(resp.text)

name = [a.text for a in doc('h4 a')]
book = doc('a[class="name"]').text()

老师，怎么使用pyquery提取作者名字啊，里面含有跟作者名称一样的a标签，我试了好久就是没提取出来，而且这个是不规律的，没法使用if进行筛选

Python全系列/第十六阶段：Python 爬虫开发/爬虫反反爬- 861楼

老师，这是怎么回事啊?我的数据库连接正常啊，但是tomcat出不来

Python全系列/第十六阶段：Python 爬虫开发/docker容器扩展-旧20230925 862楼

url = f'https://www.maoyan.com/films?showType=2&offset=0'
headers = {
    'User-Agent': UserAgent().chrome
}
resp = requests.get(url , headers=headers)
e = etree.HTML(resp.text)
movie_title = e.xpath('//dl[@class="movie-list"]/dd/div[@class="channel-detail movie-item-title"]/@title')
# movie_href = e.xpath("//dl[@class='movie-list']/dd/div[@class='channel-detail movie-item-title']/a/@href")
print(movie_title)
# for i in movie_title:
#     print(i)

老师，为啥我获取到的title和href值都只有1个

Python全系列/第十六阶段：Python 爬虫开发/爬虫基础（旧） 863楼

找了所有浏览器连负载都没有都不介绍一下

Python全系列/第十六阶段：Python 爬虫开发/爬虫基础 864楼

class ZolSpider(scrapy.Spider):
    name = 'zol'
    allowed_domains = ['faloo.com']
    start_urls = ['https://b.faloo.com/1216400_1.html']

    def parse(self, response):
        name = response.xpath('//h1/text()').extract_first()

        content = response.xpath('//div[@class="noveContent"]/p/text()').extract()

        next_url = response.xpath('//*[@id="next_page"]/@href').get()

        yield {
            'name':name,
            'content':content
        }
       
        yield scrapy.Request('https:'+next_url,callback=self.parse)

老师我有两个问题 parse这个方法是只要有url无限传递那他就会无限爬取吗类似于爬虫加个for循环

Python全系列/第十六阶段：Python 爬虫开发/scrapy框架使用 865楼

请问他这个怎么不变呀？

from urllib.request import urlopen
from urllib.request import Request

url = 'http://www.baidu.com/'

# 定义User-Agent变量
headers = {
    "User-Agent":" Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67"
}
# 封装request对象
request = Request(url, headers=headers)
# 发送请求
response = urlopen(url)

print(response.read().decode())

Python全系列/第十六阶段：Python 爬虫开发/爬虫基础（旧） 866楼

from urllib.request import Request,build_opener,HTTPCookieProcessor
from urllib.parse import urlencode  #转换用的
from fake_useragent import UserAgent
from http.cookiejar import MozillaCookieJar#保存cookie得文件需要引进的模块

def get_cookie():
    login_url='http://learn.open.com.cn/Account/Login'#登录网站的url
    from_data={
        "user": "jxt17703612482",
        "password": "JXTjxt00"
    }
    headers={"UserAgent":UserAgent().random}
    rep=Request(login_url, headers=headers, data=urlencode(from_data).encode())

    cookie_jar=MozillaCookieJar()#保存cookie
    handler=HTTPCookieProcessor(cookie_jar)#参数cookie
    opener=build_opener(handler)
    resp=opener.open(rep)
    cookie_jar.save('cookie.txt',ignore_discard=True,ignore_expires=True)#【！】保存cookie，在cookie.txt文件夹里

def use_cookie():
    info_url='http://learn.open.com.cn/StudentCenter/MyCourse/MyCourseDetail?CourseID=69249&CourseIndex=0'#登录个人中心的url
    headers = {"UserAgent": UserAgent().random}
    rea=Request(info_url,headers=headers)
    cookie_jar=MozillaCookieJar()
    cookie_jar.load('cookie.txt',ignore_discard=True,ignore_expires=True)#加载用cookie_jar.load
    handler=HTTPCookieProcessor(cookie_jar)
    opener=build_opener(handler)
    resp=opener.open(rea)
    print(resp.read().decode())


if __name__ == '__main__':
    get_cookie()
    # use_cookie()

老师，我这显示以下错误，上面是我得代码。我搞了一个多小时了，也没整明白，你帮我看看。

D:\pythonDownloads\python.exe E:/demo1/test12/pdemo/15cookie的使用3.py

Traceback (most recent call last):

File "E:/demo1/test12/pdemo/15cookie的使用3.py", line 34, in <module>

get_cookie()

File "E:/demo1/test12/pdemo/15cookie的使用3.py", line 18, in get_cookie

resp=opener.open(rep)

File "D:\pythonDownloads\lib\urllib\request.py", line 531, in open

response = meth(req, response)

File "D:\pythonDownloads\lib\urllib\request.py", line 641, in http_response

'http', request, response, code, msg, hdrs)

File "D:\pythonDownloads\lib\urllib\request.py", line 569, in error

return self._call_chain(*args)

File "D:\pythonDownloads\lib\urllib\request.py", line 503, in _call_chain

result = func(*args)

File "D:\pythonDownloads\lib\urllib\request.py", line 649, in http_error_default

raise HTTPError(req.full_url, code, msg, hdrs, fp)

urllib.error.HTTPError: HTTP Error 403: Forbidden

Process finished with exit code 1

Python全系列/第十六阶段：Python 爬虫开发/scrapy框架使用（旧） 867楼

请问3分45秒这里不能加载时，重启操作时怎么做到的？

Python全系列/第十六阶段：Python 爬虫开发/爬虫基础 868楼

老师，麻烦帮忙看一下我的代码：

源码：

作业_爬取拉钩职位.zip

运行结果中不能爬取到所有页面

另外保存的结果中有太多空格和\n

麻烦老师协助解决下，谢谢！

Python全系列/第十六阶段：Python 爬虫开发/爬虫反反爬- 869楼

老师在安装一些库的时候下载实在是太慢了，建议写文档的时候加上一个

国内的镜像源，这是我常用的清华镜像源：https://pypi.tuna.tsinghua.edu.cn/simple

Python全系列/第十六阶段：Python 爬虫开发/爬虫基础 870楼

同学您好