Dowemo
0 0 0 0

HTMLParser

  • Notes on the handler methods
from html.parser import HTMLParser


class MyParser(HTMLParser):
    """Skeleton ``HTMLParser`` subclass that lists every handler hook.

    All hooks are no-ops; override the ones you need in a subclass.

    :param convert_charrefs: forwarded to ``HTMLParser``. With the default
        ``True`` the base parser converts character/entity references in
        text itself and never calls ``handle_charref`` or
        ``handle_entityref``; pass ``False`` to receive those callbacks.
    """

    def __init__(self, *, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag such as ``<br/>``.

        Delegates to the base class, which reports the tag through
        ``handle_starttag`` followed by ``handle_endtag``.
        """
        super().handle_startendtag(tag, attrs)

    def handle_starttag(self, tag, attrs):
        """Handle an opening tag, e.g. ``<a>``."""

    def handle_endtag(self, tag):
        """Handle a closing tag, e.g. ``</a>``."""

    def handle_charref(self, name):
        """Handle a numeric character reference, i.e. one starting with ``&#``."""

    def handle_data(self, data):
        """Handle text between tags, e.g. ``baidu`` in
        ``<a href="http://www.baidu.com">baidu</a>``.
        """

    def handle_comment(self, data):
        """Handle a comment, ``<!-- ... -->``."""

    def handle_decl(self, decl):
        """Handle a ``<!...>`` declaration, e.g. ``<!DOCTYPE HTML>``."""

    def handle_entityref(self, name):
        """Handle a named character reference, e.g. ``&nbsp;``."""

    def handle_pi(self, data):
        """Handle a processing instruction, ``<?instruction>``."""

Crawling Douban's now-playing movie listings

import json
from html.parser import HTMLParser


def _find_attr(attrs, name):
    """Return the value of the first attribute called *name*, or ``None``."""
    return next((value for key, value in attrs if key == name), None)


class MovieParser(HTMLParser):
    """Movie parser.

    Every ``<li>`` start tag with a non-empty ``data-title`` attribute is
    turned into a dict with the keys ``title``, ``score``, ``director`` and
    ``actors`` and appended to ``self.moives``.
    """

    def __init__(self):
        super().__init__()
        # The attribute keeps its original (misspelled) name so existing
        # code that reads ``parser.moives`` keeps working.
        self.moives = []

    def handle_starttag(self, tag, attrs):
        """Record one movie for each ``<li>`` tag that carries ``data-title``."""
        if tag != 'li':
            return
        title = _find_attr(attrs, 'data-title')
        if not title:
            return
        score = _find_attr(attrs, 'data-score')
        self.moives.append({
            'title': title,
            # A missing score is stored as the text "None", not as None.
            'score': "None" if score is None else score,
            'director': _find_attr(attrs, 'data-director'),
            'actors': _find_attr(attrs, 'data-actors'),
        })

    def error(self, message):
        """Ignore parser error callbacks instead of raising.

        NOTE(review): this hook is presumably only invoked by older
        ``html.parser`` versions; it is kept for compatibility.
        """
        pass


def my_movies(url):
    """Fetch a listing page and parse the movies on it.

    :param url: address of the page to download
    :return: list of movie dicts, or ``None`` if the HTTP request failed
    """
    # Imported here so the parser above stays importable when the
    # third-party ``requests`` package is not installed.
    import requests

    headers = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)'}
    try:
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('发生异常', exc)
        return None

    parser = MovieParser()
    parser.feed(response.text)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    return parser.moives


def save_file(path, text):
    """Write text to a file as UTF-8, replacing any existing content.

    :param path: destination path
    :param text: text to write
    :return: None
    """
    with open(path, 'w', encoding='UTF-8') as handle:
        handle.write(text)


if __name__ == '__main__':
    url = "https://movie.douban.com/cinema/nowplaying/chongqing/"
    movies = my_movies(url)
    # Skip the write on failure rather than saving the text "None".
    if movies is not None:
        # json.dumps escapes quotes and encodes None as null, which the
        # previous str(...).replace(...) approach could not do.
        save_file("d:/upload/movies.json", json.dumps(movies, ensure_ascii=False))



Copyright © 2011 Dowemo All rights reserved.    Creative Commons   AboutUs