Python 单机处理30G数据
# Two independent snippets:
#   1. GetIdList - pull the paragraph text out of <div class="entry-content">
#      in an HTML page.
#   2. A preview loop that prints the first few records of a JSON-lines file,
#      reading it line by line instead of loading the whole file into memory.

import io
import json
from itertools import islice

# NOTE: sgmllib exists only in Python 2 (it was removed in Python 3.0), so
# this script as a whole still requires a Python 2 interpreter.
from sgmllib import SGMLParser


class GetIdList(SGMLParser):
    """Collect the text of <p> elements inside <div class="entry-content">.

    SGMLParser dispatches each tag to the matching ``start_<tag>`` /
    ``end_<tag>`` method; this class uses those hooks to track whether the
    parser is currently inside the target div (including nested child divs)
    and inside a paragraph, and stores every text fragment seen there in
    ``IDlist``.
    """

    def reset(self):
        """Initialise the collector state, then reset the base parser."""
        self.IDlist = []      # text fragments collected so far
        self.flag = False     # True while inside <div class="entry-content">
        self.getdata = False  # True while inside a <p> within that div
        self.verbatim = 0     # nesting depth of child <div>s inside that div
        SGMLParser.reset(self)

    def start_div(self, attrs):
        """Handle ``<div ...>``; ``attrs`` is a list of (name, value) pairs."""
        if self.flag:
            # Entered a child div of the target div: one level deeper.
            self.verbatim += 1
            return
        # Scan every attribute; an exact class="entry-content" match marks
        # the start of the target div.
        if any(k == 'class' and v == 'entry-content' for k, v in attrs):
            self.flag = True

    def end_div(self):
        """Handle ``</div>``."""
        if self.verbatim == 0:
            # No child div is open, so this closes the target div itself
            # (or a div outside it, in which case flag is already False).
            self.flag = False
        elif self.flag:
            # Left a child div of the target div: one level shallower.
            self.verbatim -= 1

    def start_p(self, attrs):
        """Handle ``<p>``; start collecting only inside the target div."""
        if self.flag:
            self.getdata = True

    def end_p(self):
        """Handle ``</p>``; stop collecting text."""
        self.getdata = False

    def handle_data(self, text):
        """Store a text fragment if the parser is inside a wanted <p>."""
        if self.getdata:
            self.IDlist.append(text)

    def printID(self):
        """Print every collected text fragment, one per line."""
        for fragment in self.IDlist:
            # Single-argument print(): same output on Python 2 and Python 3.
            print(fragment)


lister = GetIdList()
# NOTE(review): `the_page` is not defined in the visible source; presumably it
# holds the HTML text of a previously fetched page - confirm against the
# missing part of the script.
lister.feed(the_page)
# Flush any input still buffered by the parser before reading the results.
lister.close()
lister.printID()

path = "../items.txt"
count = 2  # number of records to preview

# NOTE(review): the original carried commented-out prints for these keys,
# which presumably describe the record schema - confirm against the data:
# category, info_id, publish_time, title, fromdb, media, item_type, content,
# source, topic, tag, poi, source_id, position.

# io.open decodes UTF-8 while reading, so every line is already text on both
# Python 2 and Python 3 and needs no manual decode before json.loads.
with io.open(path, "r", encoding="utf8") as f:
    # Iterate lazily: the file is far too large to load at once. Blank lines
    # are skipped because json.loads would raise ValueError on them.
    records = (line for line in f if line.strip())
    for line in islice(records, count):
        item = json.loads(line)
        for key, val in item.items():
            print("%s %s" % (key, val))

