import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from time import sleep

def get_list():
    num = int(input("How many pages of data to fetch: "))
    for i in range(num):
        url = "https://maoyan.com/films?showType=3&offset={i*30}"
        headers = {'User-Agent': UserAgent().chrome}
        resp = requests.get(url, headers=headers)
        soup = BeautifulSoup(resp.text, 'lxml')
        all_a = soup.select('div > a[data-act="movies-click"]')
        print(all_a)
        return [a.get('href') for a in all_a]

# Build the actor set, de-duplicating along the way
def format_actors(a_list):
    actor_set = set()
    for a in a_list:
        actor_set.add(a.text.strip())  # take the text inside the a tag and strip whitespace
    return actor_set

def start():
    all_href = get_list()
    for a in all_href:
        sleep(2)
        url = f"https://maoyan.com{a}"
        headers = {'User-Agent': UserAgent().chrome}
        resp = requests.get(url, headers=headers)
        soup = BeautifulSoup(resp.text, 'lxml')
        name = soup.select('h1.name')[0].text.strip()
        types = soup.select('li.ellipsis')[0].text.strip()
        actors_m = soup.select('li.celebrity.actor > div > a')
        actors = format_actors(actors_m)
        print(f'Movie: {name}  Type: {types}  Actors: {actors}')

if __name__ == '__main__':
    start()
Teacher, could you take a look at why this happens? My output has no results!
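A likely cause, judging only from the pasted code: the list-page URL is a plain string, so {i*30} is never substituted and every request asks for the literal path offset={i*30}; the return inside the loop also means only the first page is ever collected. A minimal sketch of the URL fix (only the f prefix changes; num is a made-up stand-in for the value read from input()):

# hypothetical illustration of the f-string fix
num = 2
for i in range(num):
    # the f prefix makes {i*30} evaluate to 0, 30, 60, ... instead of staying literal text
    url = f"https://maoyan.com/films?showType=3&offset={i * 30}"
    print(url)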
from pymongo import MongoClient

# Get a connection (both forms are equivalent)
c1 = MongoClient('localhost', 27017)
c2 = MongoClient('mongodb://localhost:27017')

# Get a database instance
db = c1.sxt
db2 = c1["sxt"]
print(db == db2)

# Get a collection
p1 = db.person
p2 = db['person']
print(p1 == p2)

# Query the data
for p in p1.find():
    print(p)
The connection is fine, but no data is printed.
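One thing worth checking, assuming the connection really is fine: find() on an empty collection returns an empty cursor, so the loop body never runs. A short sketch that counts the documents and inserts a test record first (the test document is made up purely for illustration):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
coll = client.sxt.person

# if this prints 0, there is simply nothing to iterate over yet
print('documents in sxt.person:', coll.count_documents({}))

coll.insert_one({'name': 'test', 'age': 20})  # hypothetical test document
for doc in coll.find():
    print(doc)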
'''
If Chinese text comes out garbled, change the encoding to 'utf-8';
if \u escape sequences appear, fix them with ensure_ascii=False as in the dumps() call.
'''
import json

str_1 = '{"name":"战狼3"}'
print(type(str_1))   # prints <class 'str'>
print(str_1)         # prints {"name":"战狼3"}

print('----str_1 to obj_1----')
# Decode the JSON-formatted string into a Python object
obj_1 = json.loads(str_1)
print(obj_1)         # prints {'name': '战狼3'}
print(type(obj_1))   # prints <class 'dict'>

print('----obj_1 to str_2----')
# Convert the Python object back into a JSON string, returning a str
str_2 = json.dumps(obj_1)  # str_2 comes back with Unicode escapes
# str_2 = json.dumps(obj_1, ensure_ascii=False)  # ensure_ascii=False keeps the original characters
print(str_2)         # without ensure_ascii=False: {"name": "\u6218\u72fc3"}
# print(str_2)       # with ensure_ascii=False: {"name": "战狼3"}
print(type(str_2))   # prints <class 'str'>

print('----obj_1 to file----')
# Serialize the Python object to JSON and write it to a file
json.dump(obj_1, open('movie.txt', 'w', encoding='utf-8', ensure_ascii=False))  # write movie.txt

print('----file to obj_2----')
# Read the JSON string from the file and convert it back to a Python type
obj_2 = json.load(open('movie.txt', encoding='utf-8'))
print(obj_2)         # with encoding='utf-8': {'name': '战狼3'}
print(type(obj_2))   # prints <class 'dict'>
Teacher, hello. I tested this and found two error points: one is the \u in the comment at the top, and the other is the ensure_ascii=False used when writing the movie file, which seems to be an invalid keyword argument. Why is the \u I wrote in the comment wrong? And how do I get the file produced in the "write to file" step to contain Chinese? I don't really understand these encoding conversions; I've tried to learn them many times, but they still don't stick.
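A sketch of the two fixes, based only on the code above: in a normal (non-raw) string or docstring, \u must be followed by four hex digits, so writing \u这种 raises a SyntaxError; use a raw string or double the backslash. And ensure_ascii=False is an argument of json.dump(), not of open(), which is why open() rejects it as an invalid keyword argument. Passing it to dump() is also what makes the file contain Chinese instead of \uXXXX escapes:

import json

# use a raw string (or \\u) so backslash-u is not treated as a unicode escape
note = r'if \u escapes appear, pass ensure_ascii=False to json.dumps()/json.dump()'

obj_1 = {"name": "战狼3"}

# ensure_ascii=False belongs to json.dump(); open() only takes the encoding
with open('movie.txt', 'w', encoding='utf-8') as f:
    json.dump(obj_1, f, ensure_ascii=False)  # the file now contains 战狼3, not \u6218\u72fc3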
Teacher, how do I switch the default Python version on Linux?
Teacher, could you share the software installer? I can't download it from the official site.
Also lessons 59 and 60 — both of them, not just one — have a black screen.
Teacher, when I click Run it stays on the screen below, and I can't see the output.
Teacher, this chapter has no video; the screen is black.
Teacher, what is the cause of this? When I debug, the URL and everything else look fine, but no data is scraped and it also reports this error.
scrapy02.zip
from fake_useragent import UserAgent
import requests
import re

url = "https://www.qiushibaike.com/text/"
headers = {
    "User-Agent": UserAgent().chrome
}
response = requests.get(url, headers=headers)
print(response.text)

contents = re.findall(r'<div class = "content">\s*<span>\s*(.+)', response.text)
with open('qiushi.txt', 'a', encoding='utf-8') as f:
    for info in contents:
        f.write(info + "\n\n")

Teacher, after running this there is no data in the txt file. Please take a look for me.
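A likely cause, assuming the page markup is <div class="content"> with no spaces around the equals sign: the pattern written with spaces can never match, so findall returns an empty list and nothing is written. A hedged sketch of an adjusted pattern (re.S additionally lets the capture cover jokes that span several lines):

import re
import requests
from fake_useragent import UserAgent

response = requests.get("https://www.qiushibaike.com/text/",
                        headers={"User-Agent": UserAgent().chrome})

# assumed markup: <div class="content"> ... <span> joke text </span>
contents = re.findall(r'<div class="content">\s*<span>\s*(.+?)\s*</span>',
                      response.text, re.S)
print(len(contents))  # should be non-zero if the page really uses that markup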
The page http://www.sxt.cn/index/user.html used in the course shows as not existing — what should I do?
Teacher, why is the debug view not visible?
import requests, re
import json
from requests.api import head

url = "http://api-my.le.com/vcm/api/list?jsonp=jQuery19108692358134849103_1628923285688&type=video&rows=20&page=1&sort=&cid=2&source=1&xid=74513092&pid=10058550&ctype=cmt%2Cimg%2Cvote&listType=1&_=1628923285700"
ua = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
html = requests.get(url, headers=ua).text
print("-" * 500)
contents = re.findall(r'"content":"(.*?)"', html)
print(contents, len(contents), sep="\n")
new_contens = [content.encode('utf-8').decode('unicode-escape') for content in contents]
print(new_contens)
Teacher, why can't all of the Unicode strings be converted to Chinese? I've searched for a long time and still haven't figured it out.
How should I handle this? Any advice would be appreciated. Thank you.
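A sketch of an alternative decoding path: encode('utf-8').decode('unicode-escape') only round-trips plain \uXXXX sequences and mangles characters that are already non-ASCII, which is one reason part of the text stays garbled. Since the response is JSONP wrapping JSON, letting the json module do the unescaping avoids that. The "data" and "content" field names below are assumptions about the response layout, not confirmed from the post:

import json
import requests

url = "http://api-my.le.com/vcm/api/list?jsonp=jQuery19108692358134849103_1628923285688&type=video&rows=20&page=1&sort=&cid=2&source=1&xid=74513092&pid=10058550&ctype=cmt%2Cimg%2Cvote&listType=1&_=1628923285700"
ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}

html = requests.get(url, headers=ua).text

# strip the jQuery...( ... ) JSONP wrapper, then decode the inner JSON;
# json.loads interprets \uXXXX (including surrogate pairs) correctly
payload = html[html.index('(') + 1 : html.rindex(')')]
data = json.loads(payload)

# assumed layout: a top-level "data" list whose items carry a "content" field
comments = [item.get('content', '') for item in data.get('data', [])]
print(comments)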
# _*_ coding=utf-8 _*_
from fake_useragent import UserAgent
import requests
from lxml import etree
from time import sleep


def get_html(url):
    """
    :param url: the URL to crawl
    :return: the HTML text
    """
    headers = {
        "User-Agent": UserAgent().chrome
    }
    resp = requests.get(url, headers=headers)
    sleep(3)
    if resp.status_code == 200:
        resp.encoding = 'utf-8'
        return resp.text
    else:
        return None


def parse_list(html):
    """
    :param html: the HTML of a movie-list page
    :return: a list of movie detail URLs
    """
    e = etree.HTML(html)
    list_url = ['https://maoyan.com' + url for url in e.xpath('//div[@class="movie-item-hover"]/a/@href')]
    return list_url


def parse_index(html):
    """
    :param html: the HTML of a movie detail page
    :return: the extracted movie information
    """
    e = etree.HTML(html)
    names = e.xpath('//h1/text()')[0]
    type = e.xpath('//li[@class="ellipsis"]/a/text()')[0]
    actor = e.xpath('//ul[@class="celebrity-list clearfix"]/li[@class="celebrity actor"]/div/a/text()')
    actors = format_actor(actor)
    return {'name': names, 'type': type, 'actor': actors}


def format_actor(actors):
    actor_set = set()  # de-duplicate
    for actor in actors:
        actor_set.add(actor.strip())
    return actor_set


def main():
    num = int(input('How many pages of data to fetch: '))
    for y in range(num):
        url = 'https://maoyan.com/films?showType=3&offset={}'.format(y * 30)
        # print(url)
        list_html = get_html(url)
        list_url = parse_list(list_html)
        for url in list_url:
            # print(url)
            info_html = get_html(url)
            movie = parse_index(info_html)
            print(movie)


if __name__ == '__main__':
    main()
Teacher, why is there no data!
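A diagnostic sketch rather than a fix: one common possibility (an assumption, not confirmed from the post) is that Maoyan serves an anti-crawler verification page instead of the real film list, so every XPath expression matches an empty list. Printing what actually comes back makes that visible; the offset=0 page is used here only as a probe:

import requests
from fake_useragent import UserAgent

resp = requests.get('https://maoyan.com/films?showType=3&offset=0',
                    headers={'User-Agent': UserAgent().chrome})
print(resp.status_code)
print(resp.text[:300])  # if this shows a verification prompt, the parser has nothing to match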
from urllib.request import Request, build_opener, urlopen
from fake_useragent import UserAgent
from urllib.parse import urlencode
from urllib.request import HTTPCookieProcessor

login_url = "https://www.mypianku.net/user/login/"
form_data = {
    "user": "18628906572",
    "password": "x..k.d$2000$"
}
headers = {
    "User-Agent": UserAgent().random
}
req = Request(login_url, headers=headers, data=urlencode(form_data).encode())
opener = build_opener()
resp = opener.open(req)
# --------------------------- login succeeded --------------------
url = "https://www.mypianku.net/user/account/"
headers = {
    "User-Agent": UserAgent().random,
    "Cookie": "_pk_id.1.f469=6d7b76987328fd10.1626832666.; d_c=d_1412928085; vrg_sc=4bb1ca6010ff37986d716442a23afa73; vrg_go=1; pyad=2; player123=%E6%B2%90%E6%B5%B4%E4%B9%8B%E7%8E%8BHD%23%23%2Fpy%2FlNmZxwmZsBTZ_1.html%23%235812%23%2394%24%24%E4%B8%80%E8%B7%AF%E6%83%8A%E5%96%9C%E6%AD%A3%E7%89%87%23%23%2Fpy%2FlNmZoRWMp1WM_1.html%23%23271%23%234%24%24%E4%B8%80%E8%B7%AF%E6%83%8A%E5%96%9C%E9%AB%98%E6%B8%85%23%23%2Fpy%2FlNGbrBTZ2wmY_1.html%23%23477%23%238%24%24%E6%B3%95%E5%8C%BB%E7%A7%A6%E6%98%8E2%E6%B8%85%E9%81%93%E5%A4%AB%E7%AC%AC2%E9%9B%86%23%23%2Fpy%2FlNGbmZjY3YmN_2.html%23%231479%23%2368%24%24%E6%B3%95%E5%8C%BB%E7%A7%A6%E6%98%8E%E7%AC%AC03%E9%9B%86%23%23%2Fpy%2FlRmNkdmZsRmN_3.html%23%23837%23%2342%24%24%E4%BA%BA%E5%86%8D%E5%9B%A7%E9%80%94%E4%B9%8B%E6%B3%B0%E5%9B%A7%E6%AD%A3%E7%89%87%23%23%2Fpy%2FlNmZkRjYjlDa_1.html%23%231206%23%2319%24%24%E9%80%9F%E5%BA%A6%E4%B8%8E%E6%BF%80%E6%83%853%EF%BC%9A%E4%B8%9C%E4%BA%AC%E6%BC%82%E7%A7%BBHD1280%E9%AB%98%E6%B8%85%E4%B8%AD%E5%AD%97%E7%89%88%23%23%2Fpy%2FlNWZ5wWb2ADb_1.html%23%23783%23%2313%24%24%E9%80%9F%E5%BA%A6%E4%B8%8E%E6%BF%80%E6%83%856HD1280%E9%AB%98%E6%B8%85%E4%B8%AD%E8%8B%B1%E5%8F%8C%E5%AD%97%E7%89%88%23%23%2Fpy%2FlNWZ5wWb2MzZ_1.html%23%235487%23%2370%24%24%E7%88%B1%E6%83%85%E5%85%AC%E5%AF%93%E7%AC%AC02%E9%9B%86%23%23%2Fpy%2FlRWZ3kGatNDZ_2.html%23%235%23%230%24%24; _pk_ref.1.f469=%5B%22%22%2C%22%22%2C1628861961%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D; _pk_ses.1.f469=1; PHPSESSID=iihse8vlb216gg5fdg3gp1enr2; Pianku_auth=06c5N-Pw7RaPoL7AOK2gZ9aIXMWGJb9xfbzdsIoHXUhn5Z-bGs1l68_Lhs7og6jww6iG_WLRbyEnhRXTbu_vthptMBgPXAm5yYV9rJlFKV2fnCs086hqg2uotFTErgHhyApWJTPsdVY19PwZJf_HwhyE7FcC83swIfUitbx_hsbqF2XVCp-zj5IU12U; Pianku_cookietime=95b9iaLnS9KECMWRwUf-834BhwpxfnYShmmwOys_Yp0DjXcSV_C1"
}
request = Request(url, headers=headers)
response = urlopen(request)
print(response.read().decode())
Teacher, what does importing the following line mean? from urllib.request import HTTPCookieProcessor
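A short sketch of what that import is for: HTTPCookieProcessor is a urllib handler that stores the cookies a server sends into a CookieJar and automatically re-attaches them to later requests made through the same opener, so the long Cookie header does not have to be copied by hand. Whether the login itself succeeds still depends on the site; the sketch below reuses the login form from the post purely for illustration:

from http.cookiejar import CookieJar
from urllib.request import Request, build_opener, HTTPCookieProcessor
from urllib.parse import urlencode

# the handler saves cookies returned by the server and sends them on later requests
cookie_jar = CookieJar()
opener = build_opener(HTTPCookieProcessor(cookie_jar))

login_req = Request(
    "https://www.mypianku.net/user/login/",
    data=urlencode({"user": "18628906572", "password": "x..k.d$2000$"}).encode(),
    headers={"User-Agent": "Mozilla/5.0"},
)
opener.open(login_req)  # log in; any session cookie is now stored in cookie_jar

# the same opener reuses the stored cookies, no manual Cookie header needed
resp = opener.open("https://www.mypianku.net/user/account/")
print(resp.read().decode())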
©2014-2025百战汇智(北京)科技有限公司 All Rights Reserved 北京亦庄经济开发区科创十四街 赛蒂国际工业园网站维护:百战汇智(北京)科技有限公司 京公网安备 11011402011233号 京ICP备18060230号-3 营业执照 经营许可证:京B2-20212637