#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Harvest proxy addresses from www.cybersyndrome.net through a proxy.

Each cycle picks one unused, non-CN proxy from the ``lists`` table, fetches
every page named in the ``type`` table whose ``time`` is due, stores the
proxies that are not in ``lists`` yet, and then sleeps until the next cycle.
When no usable proxy is left an alert mail is sent and the script exits.

The script targets Python 2 (it relies on ``urllib2``).
"""
from database import *
import urllib2
import re
import sys
import time
import binascii
import logging
import os
import smtplib
import zlib
from email.header import Header
from email.mime.text import MIMEText

# Alert recipients.
mailto_list = ["xupeng.js@gmail.com"]
# SMTP server, user name, password and mailbox domain suffix.
# NOTE(review): credentials are hard-coded here and in _run_cycle(); consider
# loading them from the environment or a config file kept out of version control.
mail_host = "smtp.163.com"
mail_user = "user"
mail_pass = "pwd"
mail_postfix = "163.com"

# Failed attempts on a single page before the proxy is marked as used.
MAX_FETCH_ATTEMPTS = 11
# Unparsable pages tolerated per proxy before it is marked as used.
MAX_BAD_PAGES = 10
# Seconds to wait for the proxy to answer one request.
FETCH_TIMEOUT_SECONDS = 20
# Seconds to sleep when no usable delay can be derived from the fetched pages.
DEFAULT_SLEEP_SECONDS = 1200

_UPDATED_RE = re.compile(r'<b>Updated:(.*)JST</b>')
_ITEM_RE = re.compile(r'<a\s*title="(.*)"\sonMouseOver(.*)>(.*)</a>')

# Proxy id -> number of pages fetched through it that could not be parsed.
arr = {}


def send_mail(to_list, sub, content):
    """Send a plain-text mail through the configured SMTP account.

    to_list: list of recipient addresses
    sub:     subject line
    content: message body

    Returns True when the message was handed to the server, False on any
    error (the error text is printed).

    Example: send_mail(["aaa@126.com"], "sub", "content")
    """
    me = mail_user + "<" + mail_user + "@" + mail_postfix + ">"
    # Declare UTF-8 explicitly: the alerts sent by this script are not ASCII.
    msg = MIMEText(content, 'plain', 'utf-8')
    msg['Subject'] = Header(sub, 'utf-8')
    msg['From'] = me
    # Address lists in mail headers are comma separated.
    msg['To'] = ", ".join(to_list)
    try:
        server = smtplib.SMTP()
        server.connect(mail_host)
        try:
            server.login(mail_user, mail_pass)
            server.sendmail(me, to_list, msg.as_string())
        finally:
            # Release the connection even when login or sending fails.
            server.close()
        return True
    except Exception as e:
        print(str(e))
        return False


def _build_request(url):
    """Return a browser-like GET request for *url*."""
    req = urllib2.Request(url)
    req.add_header('Host', 'www.cybersyndrome.net')
    # The value must not repeat the header name or contain a line break.
    req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
    req.add_header('Accept-Encoding', 'gzip, deflate')
    req.add_header('Accept-Charset', 'utf-8;q=0.7,*;q=0.7')
    req.add_header('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3')
    req.add_header('Referer', url)
    return req


def _open_page(opener, url, daili, showtime):
    """Open *url* through *opener*; return the response or None.

    Up to MAX_FETCH_ATTEMPTS attempts are made back to back.  URL errors are
    written to the log, any other error is printed.
    """
    for _ in range(MAX_FETCH_ATTEMPTS):
        try:
            return opener.open(_build_request(url), timeout=FETCH_TIMEOUT_SECONDS)
        except urllib2.URLError as e:
            logging.debug(' ' + str(daili) + ' [' + showtime + '] ' + url + '---->' + str(e))
        except Exception as e:
            print(e)
    return None


def _decode_body(raw, content_encoding):
    """Undo the gzip/deflate transfer coding requested via Accept-Encoding.

    urllib2 does not decompress responses itself.  If the data cannot be
    decompressed it is returned unchanged, so the caller's parsing fails and
    the page is counted as unparsable.
    """
    encoding = (content_encoding or '').strip().lower()
    try:
        if encoding in ('gzip', 'x-gzip'):
            return zlib.decompress(raw, 16 + zlib.MAX_WBITS)
        if encoding == 'deflate':
            try:
                return zlib.decompress(raw)
            except zlib.error:
                # Some servers send a raw deflate stream without zlib header.
                return zlib.decompress(raw, -zlib.MAX_WBITS)
    except zlib.error:
        pass
    return raw


def _parse_updated(page):
    """Return the page's "Updated" stamp as epoch seconds, or None.

    NOTE(review): the page labels the stamp as JST, but time.mktime()
    interprets it in the local time zone -- confirm this is intended.
    """
    match = _UPDATED_RE.search(page)
    if match is None:
        return None
    try:
        stamp = time.strptime(match.group(1).strip(), '%Y/%m/%d %H:%M')
        return int(time.mktime(stamp))
    except (ValueError, OverflowError):
        return None


def _parse_proxies(page):
    """Return the (area, proxy address) pairs found in the page's <ol> list."""
    start = page.find('<ol>')
    end = page.find('</ol>')
    if start == -1 or end == -1:
        return []
    found = []
    for item in page[start:end].split('</li><li>'):
        match = _ITEM_RE.search(item)
        # Skip list items that do not look like a proxy entry.
        if match is not None:
            found.append((match.group(1).strip(), match.group(3).strip()))
    return found


def _retire_proxy(db, daili_id):
    """Mark the proxy with id *daili_id* as used so it is not picked again."""
    db.execute("update lists set used=1 where id=%s", daili_id)
    logging.error('处理id==>' + str(daili_id))


def _harvest(db):
    """Run one harvesting pass over all due pages.

    Returns the number of seconds to sleep before the next pass, or None when
    the next pass should start immediately (the proxy failed or was retired).
    Sends an alert mail and exits the process when no unused proxy is left.
    """
    dbvalue = db.get("select id,poxyip from lists where used=0 and area != 'CN' order by createdAt desc limit 1")
    if not dbvalue:
        message = '代理已经用光了~~'
        send_mail(mailto_list, "告警", message)
        print(message)
        sys.exit()

    # Proxy used for this pass.
    daili = dbvalue['poxyip']
    daili_id = int(dbvalue['id'])

    count = 0
    next_exec_lasttime = 0
    nowtime = int(time.time())
    showtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(nowtime))
    retry_now = False
    arr.setdefault(daili_id, 0)

    # The opener only depends on the proxy, so build it once per pass.
    opener = urllib2.build_opener(urllib2.ProxyHandler({'http': daili}), urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    for row in db.query('select id,value,time from type order by time asc'):
        type_id = int(row['id'])
        value = row['value']
        if int(row['time']) > nowtime:
            continue
        url = 'http://www.cybersyndrome.net/%s.html' % (value,)

        # Fetch the page through the proxy; retire the proxy if it keeps failing.
        response = _open_page(opener, url, daili, showtime)
        if response is None:
            _retire_proxy(db, daili_id)
            retry_now = True
            break

        # Read the page content.
        try:
            try:
                raw = response.read()
                encoding = response.info().get('Content-Encoding')
            finally:
                response.close()
        except Exception as e:
            print(e)
            retry_now = True
            break
        page = _decode_body(raw, encoding)

        next_exec_time = _parse_updated(page)
        if next_exec_time is None:
            # Not the expected page: count it against the proxy.
            arr[daili_id] += 1
            break
        if next_exec_lasttime < next_exec_time:
            next_exec_lasttime = next_exec_time

        for area, poxyip in _parse_proxies(page):
            # CRC32 of the address is the lookup key used for de-duplication.
            formatpoxyip = binascii.crc32(poxyip) & 0xffffffff
            if not db.get('select id from lists where formatpoxyip = %s limit 1', formatpoxyip):
                count += 1
                db.insert('lists', poxyip=poxyip, formatpoxyip=formatpoxyip, area=area, createdAt=nowtime, type=type_id)
        db.execute("update type set time=%s where id=%s", next_exec_time, type_id)

    if arr[daili_id] >= MAX_BAD_PAGES:
        _retire_proxy(db, daili_id)
        return None
    if retry_now:
        return None

    if next_exec_lasttime == 0:
        sleeptime = DEFAULT_SLEEP_SECONDS
    else:
        sleeptime = int(next_exec_lasttime - nowtime)
        if sleeptime <= 0:
            # time.sleep() rejects negative values; use the default interval.
            sleeptime = DEFAULT_SLEEP_SECONDS
    print("[ %s ] 抓取完毕,总共抓取%d条代理" % (showtime, count))
    return sleeptime


def _run_cycle():
    """Open a database connection, run one pass and always close it again."""
    db = Connection(host='192.168.1.188:3000', database='poxy', user='xupeng', password='xupeng')
    try:
        return _harvest(db)
    finally:
        db.close()


def main():
    """Harvest proxies forever, sleeping between passes."""
    # Configure logging once so every message ends up in log.txt.
    logging.basicConfig(filename=os.path.join(os.getcwd(), 'log.txt'), level=logging.DEBUG)
    while True:
        sleeptime = _run_cycle()
        if sleeptime is not None:
            time.sleep(sleeptime)


if __name__ == '__main__':
    main()
python代理抓取,用代理抓取代理
最近需要用一些代理IP,故想用Python去网站上抓取一些代理。发现国内很多代理网站更新不是很及时,而且大多数代理都不好用;最后找到一个不错的网站,更新也很及时,但需要翻墙。所以采用通过代理去抓取代理的方式,如果代理池用光了,便发送一封邮件来提醒coder。以下是具体的实现方式。
下一篇:php图片处理类