pipeline-case-study

Python 单机处理30G数据

from sgmllib import SGMLParser
class GetIdList(SGMLParser):
    def reset(self):
        self.IDlist = []
        self.flag = False
        self.getdata = False
        self.verbatim = 0
        SGMLParser.reset(self)

    def start_div(self, attrs):
        if self.flag == True:
            self.verbatim +=1 #进入子层div了,层数加1
            return
        for k,v in attrs:#遍历div的所有属性以及其值
            if k == 'class' and v == 'entry-content':#确定进入了<div class='entry-content'>
                self.flag = True
                return

    def end_div(self):#遇到</div>
        if self.verbatim == 0:
            self.flag = False
        if self.flag == True:#退出子层div了,层数减1
            self.verbatim -=1

    def start_p(self, attrs):
        if self.flag == False:
            return
        self.getdata = True

    def end_p(self):#遇到</p>
        if self.getdata:
            self.getdata = False

    def handle_data(self, text):#处理文本
        if self.getdata:
            self.IDlist.append(text)

    def printID(self):
        for i in self.IDlist:
            print i



# Demo driver: feed an HTML document to GetIdList and print the text it collected.
# NOTE(review): `the_page` is not defined in the visible part of this file;
# presumably it holds the HTML source obtained earlier - confirm before running.
lister = GetIdList()
lister.feed(the_page)
lister.printID()

path = "../items.txt"
count = 2
import json
with open(path, "r") as f:
    for line in f:
        if count == 0:
            break
        item = json.loads(line.decode('utf8'), 'utf8')
#         print item.keys()
#         print item['category']
#         print item['info_id']
#         print item['publish_time']
#         print item['title']
#         print item['fromdb']
#         print item['media']
#         print item['item_type']
#         print item['content']
#         print item['source']
#         print item['topic']
#         print item['tag']
#         print item['poi']
#         print item['source_id']
#         print item['position']
        for key, val in item.iteritems():
            print key, val
        count -= 1