设为首页 加入收藏

TOP

Python通过代理多线程抓取图片(一)
2014-11-24 03:11:44 来源: 作者: 【 】 浏览:1
Tags:Python 通过 代理 线程 图片
''')


# Worker thread that scrapes one proxy-listing page and appends every
# proxy it finds to the shared module-level rawProxyList.
class ProxyGet(threading.Thread):
    """Fetch one listing page (self.target) and extract proxy entries."""

    def __init__(self,target):
        threading.Thread.__init__(self)
        # URL of the proxy-listing page this thread will download.
        self.target = target

    def getProxy(self):
        """Download the page, match proxy rows with the module-level regex
        `p`, decode the obfuscated port, and push [ip, port, addr] triples
        onto the shared rawProxyList."""
        print "代理服务器目标网站: " + self.target
        req = urllib2.urlopen(self.target)
        result = req.read()
        #print chardet.detect(result)
        matchs = p.findall(result)
        for row in matchs:
            ip=row[0]
            port =row[1]
            # The site encodes each port digit as a letter, joined by '+';
            # portdicts maps the letters back to their digits.
            port = map(lambda x:portdicts[x],port.split('+'))
            port = ''.join(port)
            # row[2] is captured but never used afterwards.
            agent = row[2]
            # The page is cp936/GBK-encoded; re-encode the address as UTF-8.
            addr = row[3].decode("cp936").encode("utf-8")
            proxy = [ip,port,addr]
            #print proxy
            rawProxyList.append(proxy)

    def run(self):
        # Thread entry point: delegate to getProxy().
        self.getProxy()


# Worker thread that validates each proxy in a list by fetching a test URL
# through it; proxies whose response contains the expected marker are
# recorded in the shared module-level checkedProxyList.
class ProxyCheck(threading.Thread):
    def __init__(self,proxyList):
        threading.Thread.__init__(self)
        # List of [ip, port, addr] entries to test.
        self.proxyList = proxyList
        # Per-request timeout, in seconds.
        self.timeout = 5
        # Page fetched through each candidate proxy.
        self.testUrl = "http://www.baidu.com/"
        # Substring expected in the response body on a genuine fetch.
        self.testStr = "030173"

    def checkProxy(self):
        """Try every proxy; append (ip, port, addr, elapsed_seconds) for the
        ones that return the test page containing self.testStr."""
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            # Route plain-HTTP traffic through the candidate proxy.
            proxyHandler = urllib2.ProxyHandler({"http" : r'http://%s:%s' %(proxy[0],proxy[1])})
            #print r'http://%s:%s' %(proxy[0],proxy[1])
            opener = urllib2.build_opener(cookies,proxyHandler)
            # Pretend to be a desktop Firefox; some sites reject bare urllib2.
            opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0')]
            #urllib2.install_opener(opener)
            t1 = time.time()

            try:
                #req = urllib2.urlopen("http://www.baidu.com", timeout=self.timeout)
                req = opener.open(self.testUrl, timeout=self.timeout)
                #print "urlopen is ok...."
                result = req.read()
                #print "read html...."
                # Elapsed wall-clock time for the fetch through this proxy.
                timeused = time.time() - t1
                pos = result.find(self.testStr)
                #print "pos is %s" %pos

                if pos > 1:
                    checkedProxyList.append((proxy[0],proxy[1],proxy[2],timeused))
                    #print "ok ip: %s %s %s %s" %(proxy[0],proxy[1],proxy[2],timeused)
                else:
                    continue
            except Exception,e:
                #print e.message
                # Any failure (connect error, timeout, bad response) simply
                # skips this proxy — best-effort by design.
                continue

    def run(self):
        # Thread entry point: delegate to checkProxy().
        self.checkProxy()


# Collect image URLs from the target site into the module-level imgurl_list.
# NOTE(review): the two re.compile(...) pattern literals below were destroyed
# when this listing was scraped from the web page (the HTML stripper ate the
# patterns), so this function is NOT runnable as-is; the patterns must be
# restored from the original article before use.
def imgurlList(url_home):
    global imgurl_list
    home_page = urllib2.urlopen(url_home)
    url_re = re.compile(r'

  • ')
    # NOTE(review): pic_re's pattern and the url_list assignment were fused
    # into one garbled line by the scrape; presumably url_re matched sub-page
    # links on the home page and pic_re matched image sources — confirm
    # against the original post.
    pic_re = re.compile(r' url_list = re.findall(url_re,home_page.read())
    for url in url_list:
        #print url_home+url
        url_page = urllib2.urlopen(url_home+url)
        # The loop variable shadows the enclosing function's name in the
        # original listing; kept as-is.
        for imgurlList in re.findall(pic_re,url_page.read()):
            imgurl_list.append(imgurlList)


    #下载图片的类
    class getPic(threading.Thread):
    def __init__(self,imgurl_list):
    threading.Thread.__init__(self)
    self.imgurl_list = imgurl_list
    self.timeout = 5
    def downloadimg(self):
    for imgurl in self.imgurl_list:
    pic_suffi

  • 2. 抓取一个网站的图片地址,多线程随机取一个代理服务器下载图片
    ps 图片网站地址:http://www.ivsky.com(测试只选择了有限的页面数)

    #!/usr/bin/env python
    #BLOG:blog.linuxeye.com
    #coding:utf-8


    import urllib2
    import re
    import threading
    import time
    import random


    rawProxyList = []
    checkedProxyList = []
    imgurl_list = []


    #抓取代理网站
    portdicts ={'v':"3",'m':"4",'a':"2",'l':"9",'q':"0",'b':"5",'i':"7",'w':"6",'r':"8",'c':"1"}
    targets = []
    for i in xrange(1,9):
    target = r"http://www.88181.com/proxy%d.html" % i
    targets.append(target)
    #print targets


# Regex used to pull proxy rows (ip, obfuscated port script, agent, address)
# out of the listing pages.
# NOTE(review): this verbose pattern was garbled when the article was scraped
# from the web — the triple-quoted literal below is truncated and mangled and
# must be restored from the original post before the script will even parse.
p = re.compile(r'''

(.+ )<SCRIPT type=text/java script>document.write\(":"\+(.+ )\)(.+ ).+ (.+ )
    首页 上一页 1 2 3 下一页 尾页 1/3/3
    】【打印繁体】【投稿】【收藏】 【推荐】【举报】【评论】 【关闭】 【返回顶部
    分享到: 
    上一篇Python爬虫多线程抓取代理服务器 下一篇Android过滤具体应用日志的脚本

    评论

    帐  号: 密码: (新用户注册)
    验 证 码:
    表  情:
    内  容:

    ·C++ 语言社区-CSDN社 (2025-12-24 17:48:24)
    ·CSDN问答专区社区-CS (2025-12-24 17:48:22)
    ·C++中`a = b = c`与` (2025-12-24 17:48:19)
    ·C语言结构体怎么直接 (2025-12-24 17:19:44)
    ·为什么指针作为c语言 (2025-12-24 17:19:41)