# Parsing very large XML files with a streaming SAX handler.


from xml import sax

class MovieHandler(sax.ContentHandler):
    """SAX handler that copies the character data of ``<text>`` elements to a file.

    The document is processed as a stream of SAX events, so only the text of
    the ``<text>`` element currently being read is held in memory. Each
    non-blank ``<text>`` body is written to the output file exactly once, in
    document order, with no separator added between bodies.

    Attributes:
        output_path: Path of the file opened in ``startDocument``.
        write_file: The open output file, or ``None`` before parsing starts.
        CurrentData: Name of the most recently opened element that has not
            been closed yet, or ``""`` right after any element closes.
        format: Text of the most recently completed ``<text>`` element.
        type, year, rating, stars, description: Not used by this handler;
            kept so existing attribute access keeps working.
    """

    def __init__(self, output_path="result_txt"):
        """Initialise parser state.

        Args:
            output_path: Where the extracted text is written. Defaults to
                ``"result_txt"``, the file name used historically.
        """
        super().__init__()
        self.output_path = output_path
        self.write_file = None
        self.CurrentData = ""
        self.type = ""
        self.format = ""
        self.year = ""
        self.rating = ""
        self.stars = ""
        self.description = ""
        # Chunks of the <text> element being read; joined once at its end
        # instead of growing a string with repeated concatenation.
        self._text_parts = []

    def startDocument(self):
        """Open the output file when the document starts."""
        # Explicit UTF-8 so non-ASCII text does not depend on the locale's
        # default encoding.
        self.write_file = open(self.output_path, "w", encoding="utf-8")
        print('XML开始解析中...')

    def startElement(self, name, attrs):
        """Remember the element being opened and reset the buffer for ``<text>``."""
        self.CurrentData = name
        if name == "text":
            # Start from an empty buffer so earlier elements' text is never
            # written again.
            self._text_parts = []
            self.format = ""

    def characters(self, content):
        """Collect character data that belongs to a ``<text>`` element.

        The parser may deliver one element's text in several chunks, so the
        chunks are accumulated until the element ends.
        """
        if self.CurrentData == "text":
            self._text_parts.append(content)

    def endElement(self, name):
        """Write the collected text when a ``<text>`` element closes."""
        if self.CurrentData == "text":
            self.format = "".join(self._text_parts)
            self._text_parts = []
            # Skip elements that contain only whitespace.
            if self.format.strip():
                self.write_file.write(self.format)
        self.CurrentData = ""

    def endDocument(self):
        """Close the output file when the document ends."""
        self.write_file.close()
        print('XML文档解析结束!')


if __name__ == '__main__':
    # Script entry point: stream-parse the XML dump with MovieHandler.
    import sys

    # An optional first command-line argument overrides the default input path.
    if len(sys.argv) > 1:
        source_path = sys.argv[1]
    else:
        source_path = "jawiki-20190901-pages-articles-multistream.xml"

    handler = MovieHandler()
    parser = sax.make_parser()
    parser.setContentHandler(handler)
    try:
        parser.parse(source_path)
    finally:
        # endDocument() is not reached when parsing aborts with an error, so
        # close the output file here as well; closing twice is harmless.
        output = getattr(handler, "write_file", None)
        if output is not None:
            output.close()