博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
爬取猫眼网站TOP100的电影数据并写为json格式文件
阅读量:5360 次
发布时间:2019-06-15

本文共 2771 字,大约阅读时间需要 9 分钟。

# coding=utf-8
"""Scrape the Maoyan TOP100 movie board and save the data as a JSON file."""


def get_url(url):
    """Return the list of the 10 paginated board URLs, starting from *url*.

    Maoyan paginates the TOP100 board with an ``offset`` query parameter
    (10, 20, ... 90); the first page is *url* itself with no offset.
    """
    import requests
    url_list = [url]
    for num in range(10, 91, 10):
        data = {'offset': num}
        # Build the URL locally instead of downloading each full page just to
        # read back response.url (the original wasted 10 HTTP round-trips).
        url_list.append(requests.Request('GET', url, params=data).prepare().url)
    return url_list


def get_html(url):
    """Download *url* with a browser User-Agent and return the page HTML text."""
    import requests
    header = {
        'User-Agent': 'Mozilla/5.0 (X11;Ubuntu;Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    html = requests.get(url, headers=header)
    # html.encoding = 'utf-8'
    return html.text


def get_element(html):
    """Extract the raw data columns from one board page via XPath.

    Returns ``[img_href, title, actress, releasetime]`` — four parallel lists,
    one entry per movie on the page.
    """
    from lxml import etree
    html = etree.HTML(html)
    img_href = html.xpath("//dl[@class='board-wrapper']/dd/a/@href")
    title = html.xpath("//dl[@class='board-wrapper']/dd/a/@title")
    actress = html.xpath("//div[@class='movie-item-info']/p[2]/text()")
    relasetime = html.xpath("//div[@class='movie-item-info']/p[3]/text()")
    return [img_href, title, actress, relasetime]


def clear_data(data):
    """Normalise the scraped columns in place and return *data*.

    * ``data[0]`` — relative hrefs such as ``/films/1203`` are joined onto the
      site root to produce absolute URLs.
    * ``data[2]`` — actor strings are tokenised with ``str.split()``, which
      also strips the surrounding newlines and padding whitespace.
    """
    from urllib.parse import urljoin
    url = 'http://maoyan.com/'
    data[0] = [urljoin(url, tail) for tail in data[0]]
    data[2] = [act.split() for act in data[2]]
    return data


def json_dict(data):
    """Arrange the cleaned columns as ``{title: {主演: ..., 上映时间: ..., img_url: ...}}``.

    Bug fix: the original looped over ALL actors/dates/images inside the title
    loop, repeatedly overwriting, so every movie received the LAST movie's
    data.  The four columns are parallel lists — zip them one movie at a time.
    """
    json_dic = {}
    img, title, actress, releasetime = data
    for each_title, actor_tokens, each_time, each_img in zip(title, actress, releasetime, img):
        # actor_tokens comes from clear_data: first token is e.g. '主演:张国荣,张丰毅'.
        key, _, value = actor_tokens[0].partition(':')
        rt, _, t = each_time.partition(':')
        json_dic[each_title] = {key: value, rt: t, 'img_url': each_img}
    return json_dic


def dump_json(dic, filename):
    """Write *dic* to *filename* (in the current directory) as pretty JSON."""
    import os
    import json
    abspath = os.path.join(os.path.abspath('.'), filename)
    # ensure_ascii=False keeps the Chinese text readable instead of \uXXXX
    # escapes; indent=4 pretty-prints the file.
    with open(abspath, 'w', encoding='utf-8') as f:
        json.dump(dic, f, indent=4, ensure_ascii=False)


def main(url):
    """Crawl every page of the board rooted at *url* and save maoyan.json."""
    url_list = get_url(url)
    json_dic = {}
    for url in url_list:
        html = get_html(url)
        data = get_element(html)
        json_dic.update(json_dict(clear_data(data)))
    dump_json(json_dic, 'maoyan.json')


if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    main(url)

 

转载于:https://www.cnblogs.com/scorpionSpace/p/9274292.html

你可能感兴趣的文章
MySQLdb & pymsql
查看>>
zju 2744 回文字符 hdu 1544
查看>>
【luogu P2298 Mzc和男家丁的游戏】 题解
查看>>
前端笔记-bom
查看>>
上海淮海中路上苹果旗舰店门口欲砸一台IMAC电脑维权
查看>>
Google透露Android Market恶意程序扫描服务
查看>>
给mysql数据库字段值拼接前缀或后缀。 concat()函数
查看>>
迷宫问题
查看>>
【FZSZ2017暑假提高组Day9】猜数游戏(number)
查看>>
泛型子类_属性类型_重写方法类型
查看>>
练习10-1 使用递归函数计算1到n之和(10 分
查看>>
Oracle MySQL yaSSL 不明细节缓冲区溢出漏洞2
查看>>
Code Snippet
查看>>
zoj 1232 Adventure of Super Mario
查看>>
组合数学 UVa 11538 Chess Queen
查看>>
oracle job
查看>>
Redis常用命令
查看>>
[转载]电脑小绝技
查看>>
windos系统定时执行批处理文件(bat文件)
查看>>
thinkphp如何实现伪静态
查看>>