爬取某pygame技术博客
By: Roy.Liu　Last updated: 2016-06-30
听说 lxml 的性能比 BeautifulSoup 强, 所以测试一下, 看看这个东西用起来怎么样
需要用到的包: https://pypi.python.org/pypi/lxml, 这里下载适合自己的版本, 然后跑下面的测试代码
需要用到的包: https://pypi.python.org/pypi/lxml, 这里下载适合自己的版本, 然后跑下面的测试代码
# -*- coding: utf-8 -*-
"""Mirror the posts linked from an eyehere.net pygame tutorial index page.

The index page is fetched, every ``<h5><a>`` link on it is followed, and the
raw HTML of each linked page is written to ``../it/<title>.html``.  URLs that
cannot be fetched are appended to ``../it/error.txt``.

The syntax is valid on both Python 2 (the original target) and Python 3.
"""
import os
import time
import urllib  # noqa: F401  (imported by the original listing; unused here)

try:  # Python 2
    from urllib2 import Request, urlopen
    from urlparse import urljoin
except ImportError:  # Python 3
    from urllib.request import Request, urlopen
    from urllib.parse import urljoin

from lxml import *  # noqa: F401,F403  (kept from the original listing)
import lxml.html as HTML

try:
    text_type = unicode  # Python 2
except NameError:
    text_type = str  # Python 3

OUTPUT_DIR = "../it/"
ERROR_LOG = "../it/error.txt"
# The original literal was split with a backslash line continuation; the two
# halves are joined here into the single URL that was intended.
INDEX_URL = "http://eyehere.net/2011/python-pygame-novice-professional-index/"

# con() gives up once its attempt counter reaches this value.
_MAX_COUNT = 10


def _ensure_output_dir():
    """Create the output directory if it does not exist yet."""
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)


def _safe_filename(title):
    """Return *title* as text with path separators replaced by ``_``.

    Titles come from remote HTML, so a ``/`` or ``\\`` in one must not be
    allowed to point the output path outside OUTPUT_DIR.
    """
    name = text_type(title).strip()
    for separator in (u"/", u"\\"):
        name = name.replace(separator, u"_")
    return name or u"untitled"


def error(txt):
    """Append *txt* as one line to the error log."""
    _ensure_output_dir()
    with open(ERROR_LOG, "a") as f:
        f.write(txt + '\n')


def con(url, count=4):
    """Fetch *url* and return the raw response body.

    A failed attempt is retried after a one-second pause, incrementing
    *count* each time.  Once *count* has reached 10 the exception is printed,
    the URL is recorded via ``error()`` and ``None`` is returned.

    :param url: address to download.
    :param count: starting value of the attempt counter (default 4, i.e. up
        to seven attempts in total).
    :return: the response body, or ``None`` if every attempt failed.
    """
    while True:
        try:
            req = Request(url)
            req.add_header('Referer', 'http://www.baidu.com')
            req.add_header('User-Agent',
                           'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
            res = urlopen(req, timeout=20)
            try:
                return res.read()
            finally:
                # Close the connection even when read() raises.
                res.close()
        except Exception as e:  # best-effort crawler: any failure => retry
            if count >= _MAX_COUNT:
                print(e)
                error(url)
                return None
            count += 1
            time.sleep(1)


def menu(url):
    """Yield ``{'title': ..., 'url': ...}`` for each ``<h5><a>`` link at *url*.

    Links are resolved against *url*, so protocol-relative (``//host/path``),
    absolute and path-relative ``href`` values all produce usable URLs.
    Anchors without text or without an ``href`` are skipped.  Nothing is
    yielded when the page could not be downloaded or is empty.
    """
    page = con(url)
    if not page:
        return
    dom = HTML.document_fromstring(page)
    for anchor in dom.xpath("//h5/a"):
        title = anchor.text_content()
        href = anchor.get("href")
        if not title or not href:
            continue
        yield {'title': title, 'url': urljoin(url, href)}


def save(title, content):
    """Write *content* to ``../it/<title>.html``.

    The file is opened in binary mode because *content* is normally the raw
    bytes returned by ``con()``; text content is encoded as UTF-8 first.
    Path separators in *title* are replaced so the file stays in OUTPUT_DIR.
    """
    _ensure_output_dir()
    if isinstance(content, text_type):
        content = content.encode("utf-8")
    path = OUTPUT_DIR + _safe_filename(title) + u'.html'
    with open(path, 'wb') as f:
        f.write(content)


def blog():
    """Download every post listed on the index page and save it to disk."""
    for dic in menu(INDEX_URL):
        title = dic.get("title", '')
        url = dic.get("url", '')
        page = con(url)
        if page is None:
            # con() has already printed the error and logged the URL.
            continue
        save(title, page)
        print(u"saved  " + text_type(title))


if __name__ == "__main__":
    blog()
From:一号门
Previous:Spring data mongo like 查询
COMMENTS