# coding=utf-8
"""Crawl the Maoyan TOP100 movie board and save the data as a JSON file."""
import json
import os
from urllib.parse import urljoin

import requests
from lxml import etree


# Build the list of page URLs to crawl from the start URL
def get_url(url):
    # The board is paged by an offset query parameter (0, 10, ..., 90);
    # build the query strings locally instead of issuing one throwaway
    # request per page just to read back the constructed URL.
    url_list = [url]
    for num in range(10, 91, 10):
        url_list.append('{}?offset={}'.format(url, num))
    return url_list


# Fetch the page content for a URL
def get_html(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (X11;Ubuntu;Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    html = requests.get(url, headers=header)
    # html.encoding = 'utf-8'
    return html.text


# Locate the elements with XPath and extract the raw data
def get_element(html):
    html = etree.HTML(html)
    img_href = html.xpath("//dl[@class='board-wrapper']/dd/a/@href")
    title = html.xpath("//dl[@class='board-wrapper']/dd/a/@title")
    actress = html.xpath("//div[@class='movie-item-info']/p[2]/text()")
    releasetime = html.xpath("//div[@class='movie-item-info']/p[3]/text()")
    all_data = [img_href, title, actress, releasetime]
    return all_data


# Clean the scraped data
def clear_data(data):
    url = 'http://maoyan.com/'
    # Prefix each relative href (e.g. /films/1203) with the site root,
    # and strip newlines and surrounding whitespace from the actor lines.
    img_url = []
    actor = []
    img_href = data[0]
    actress = data[2]
    for tail in img_href:
        img_url.append(urljoin(url, tail))
    data[0] = img_url
    for act in actress:
        actor.append(act.strip())
    data[2] = actor
    return data


# Arrange the scraped data into a dictionary
def json_dict(data):
    '''
    Arrange the data as:
    {title: {主演: actors, 上映时间: release time, img_url: image link}}
    (the first two keys come straight from the scraped page text).
    '''
    json_dic = {}
    img, title, actress, releasetime = data
    # Pair each title with the actor line, release date and image link at
    # the same index; looping over all four lists inside the title loop, as
    # before, would leave every movie holding the last movie's data.
    for name, act, each_time, each_img in zip(title, actress, releasetime, img):
        # Each line looks like 'key:value'; split once on the first colon
        # so values that themselves contain a colon stay intact.
        key, _, value = act.partition(':')
        rt, _, t = each_time.partition(':')
        json_dic[name] = {key: value, rt: t, 'img_url': each_img}
    return json_dic


# Dump the dictionary to a JSON file
def dump_json(dic, filename):
    abspath = os.path.join(os.path.abspath('.'), filename)
    # ensure_ascii=False keeps non-ASCII characters readable in the file
    # instead of escaping them; indent=4 pretty-prints the dictionary.
    with open(abspath, 'w', encoding='utf-8') as f:
        json.dump(dic, f, indent=4, ensure_ascii=False)


# Main entry point
def main(url):
    url_list = get_url(url)
    json_dic = {}
    for page_url in url_list:
        html = get_html(page_url)
        data = get_element(html)
        json_dic.update(json_dict(clear_data(data)))
    dump_json(json_dic, 'maoyan.json')


if __name__ == '__main__':
    url = 'http://maoyan.com/board/4'
    main(url)
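
# For reference, one entry of the resulting maoyan.json should have roughly
# this shape (the movie name and field values below are hypothetical
# placeholders, not actual scraped output):
#
# {
#     "霸王别姬": {
#         "主演": "张国荣,张丰毅,巩俐",
#         "上映时间": "1993-01-01",
#         "img_url": "http://maoyan.com/films/1203"
#     }
# }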