1 # coding:utf8
2 import re
3 from HTMLParser import HTMLParser
4
5 from bs4 import BeautifulSoup
6 import urlparse
7
8 import sys
# Python 2-only workaround: force the interpreter-wide default string encoding
# to UTF-8 so implicit str <-> unicode conversions do not fall back to ASCII.
# `sys.setdefaultencoding` is removed from the `sys` namespace during
# interpreter start-up, so the module has to be reloaded to get it back.
# NOTE(review): this is a process-wide side effect triggered by importing this
# module, and neither `reload` (as a builtin) nor `setdefaultencoding` exists
# on Python 3 -- revisit before porting.
reload(sys)
sys.setdefaultencoding('utf-8')
11
12 class ParserManager(HTMLParser):
13
def __init__(self):
    """Create a parser whose list of collected resource links starts empty."""
    # Resource URLs found while parsing; handle_starttag appends to this list.
    self.links = list()
    # Let the HTMLParser base class set up its own parsing state.
    HTMLParser.__init__(self)
17
def handle_starttag(self, tag, attrs):
    """Record the URLs of external resources referenced by a start tag.

    The following values are appended to ``self.links``:

    * every ``src`` or ``href`` attribute value of an ``img`` or ``script``
      tag;
    * the ``href`` of a ``link`` tag whose ``rel`` attribute is exactly
      ``"stylesheet"``.

    Attributes that are missing, or present without a value (``None``), are
    skipped, so malformed tags neither raise ``KeyError`` nor put ``None``
    into the link list.

    :param tag: name of the tag being opened.
    :param attrs: list of ``(name, value)`` pairs for the tag's attributes.
    """
    if tag in ('img', 'script'):
        for name, value in attrs:
            if name in ('src', 'href') and value is not None:
                self.links.append(value)
    elif tag == 'link':
        attributes = dict(attrs)
        # Use .get(): <link> tags without rel or href are common (and legal
        # for rel-less markup in the wild); indexing would raise KeyError and
        # abort the whole parse.
        href = attributes.get('href')
        if attributes.get('rel') == 'stylesheet' and href is not None:
            self.links.append(href)
28
29 def parse(self, page_url, html_cont):
30 if page_url is None or html_cont is None:
31 return
32 soup = BeautifulSoup(html_cont,'html.parser',from_encoding='utf-8')
33 new_data = sel