nt = urllib2.urlopen(url).read() html = BeautifulSoup(content) fetch_resource = Resource(url, None, str(html.find('body'))[0:9999], 1) fetch_resource.updateContentAndStatus() aLinks = html.find_all('a') print 'aLinks %s' %aLinks for aLink in aLinks : href = aLink.get('href') a_text = CodeHelper.encodeContent(aLink.get_text()) print 'href %s text %s' %(href, a_text) subResource = Resource(href, a_text, '', 0) subResource.insert() def execute(): urls = ['http://www.kuwo.cn', 'http://www.1ting.com/', 'http://www.kugou.com/', 'http://y.**.com/'] for url in urls : resource = Resource(url, None, 0) resource.insert() start = time.time() resource_manager = ResourceManager(20, 4) resource_manager.wait_for_complete() end = time.time() print "cost all time: %s" % (end-start)
# Script entry point: run execute() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    execute()
|