from xml import sax
class MovieHandler(sax.ContentHandler):
    """SAX content handler that copies the body of every ``text`` element to a file.

    While a document is parsed, the character data found directly inside each
    ``text`` element is collected and, if it is not blank, written to the file
    ``result_txt`` in the current working directory. No separator is inserted
    between the bodies of consecutive elements.
    """

    def __init__(self):
        super().__init__()
        # Name of the most recently opened element; cleared whenever any
        # element closes.
        self.CurrentData = ""
        # Body of the most recently completed ``text`` element.
        self.format = ""
        # Not used by this handler; kept so existing attribute access on
        # instances keeps working.
        self.type = ""
        self.year = ""
        self.rating = ""
        self.stars = ""
        self.description = ""
        # Pieces of the ``text`` element currently being read. characters()
        # may deliver one element's content in several calls, so the pieces
        # are gathered here and joined once in endElement().
        self._text_parts = []
        # Output file; opened in startDocument() and closed in endDocument().
        self.write_file = None

    def startDocument(self):
        """Open the output file and reset per-document state."""
        self._text_parts = []
        self.format = ""
        # Explicit UTF-8 so the output does not depend on the platform's
        # locale encoding.
        self.write_file = open("result_txt", "w", encoding="utf-8")
        print('XML开始解析中...')

    def startElement(self, name, attrs):
        """Remember which element was opened last; ``attrs`` is ignored."""
        self.CurrentData = name

    def characters(self, content):
        """Collect ``content`` when the most recently opened element is ``text``."""
        if self.CurrentData == "text":
            self._text_parts.append(content)

    def endElement(self, name):
        """Write out a finished ``text`` element and clear the element state.

        The collected pieces are always discarded once a ``text`` element
        closes, so one element's body is never written again together with
        the next one.
        """
        if name == "text":
            self.format = "".join(self._text_parts)
            self._text_parts = []
            # Elements that contain only whitespace are skipped.
            if self.format.strip():
                self.write_file.write(self.format)
        self.CurrentData = ""

    def endDocument(self):
        """Close the output file once the document has been fully parsed."""
        if self.write_file is not None:
            self.write_file.close()
        print('XML文档解析结束!')
if __name__ == '__main__':
    import sys

    # File to parse: the first command-line argument when one is given,
    # otherwise the dump file name this script has always used.
    if len(sys.argv) > 1:
        source_path = sys.argv[1]
    else:
        source_path = "jawiki-20190901-pages-articles-multistream.xml"

    handler = MovieHandler()
    parser = sax.make_parser()
    parser.setContentHandler(handler)
    try:
        parser.parse(source_path)
    finally:
        # The handler closes its output file in endDocument(), which is not
        # reached when parsing stops with an error, so close it here as well.
        # Closing an already closed file is harmless.
        output = getattr(handler, "write_file", None)
        if output is not None:
            output.close()
# Q.E.D.