#!/usr/bin/env python
# -*- coding:utf-8 -*-
from database import *
import urllib2
import re
import sys
import time
import binascii
import logging
import os
import smtplib
from email.mime.text import MIMEText
# Recipients of the alert e-mail.
mailto_list = ["xupeng.js@gmail.com"]
# SMTP server, account name, password and the mailbox domain suffix.
mail_host = "smtp.163.com"
mail_user = "user"
mail_pass = "pwd"
mail_postfix = "163.com"
def send_mail(to_list, sub, content):
    """Send a plain-text e-mail through the configured SMTP account.

    Args:
        to_list: Recipient addresses, either a list/tuple of address
            strings or a single address string.
        sub: Subject line (may contain non-ASCII text encoded as UTF-8).
        content: Message body (plain text, UTF-8).

    Returns:
        True when the message was handed to the SMTP server, False on any
        failure (the error text is printed).

    Example:
        send_mail(["aaa@126.com"], "sub", "content")
    """
    from email.header import Header

    # A bare string would otherwise be joined/iterated character by character.
    if isinstance(to_list, (str, type(u""))):
        to_list = [to_list]

    me = mail_user + "<" + mail_user + "@" + mail_postfix + ">"
    try:
        # Declare UTF-8 explicitly so non-ASCII subjects/bodies are encoded
        # instead of being sent as raw 8-bit data.
        msg = MIMEText(content, 'plain', 'utf-8')
        msg['Subject'] = Header(sub, 'utf-8')
        msg['From'] = me
        # Address lists in mail headers are comma-separated.
        msg['To'] = ", ".join(to_list)

        # The timeout keeps an unreachable mail server from blocking forever.
        s = smtplib.SMTP(timeout=30)
        s.connect(mail_host)
        try:
            s.login(mail_user, mail_pass)
            s.sendmail(me, to_list, msg.as_string())
        finally:
            # Release the socket even when login/sendmail fails.
            s.close()
        return True
    except Exception as e:
        # Best-effort notification: report the problem and tell the caller.
        print(str(e))
        return False
# Proxy harvester -- "Python proxy scraping: using proxies to scrape proxies"
# (title of the article this script was published with).
#
# Each cycle picks one unused, non-CN proxy from the `lists` table, fetches
# every due listing page of www.cybersyndrome.net through it, stores the
# proxies found there, and sleeps until the next pass.  A proxy that keeps
# failing is marked as used; when no proxy is left an alert mail is sent and
# the script exits.  This is Python 2 code (urllib2, byte-string pages).

PROXY_SOURCE_URL = 'http://www.cybersyndrome.net/%s.html'
MAX_FETCH_FAILURES = 10      # failed downloads before a proxy is retired
MAX_PARSE_FAILURES = 10      # unparseable pages before a proxy is retired
DEFAULT_SLEEP_SECONDS = 1200
FETCH_TIMEOUT_SECONDS = 20

_UPDATED_RE = re.compile(r'<b>Updated:(.*)JST</b>')
_ENTRY_RE = re.compile(r'<a\s*title="(.*)"\sonMouseOver(.*)>(.*)</a>')
_REQUEST_HEADERS = (
    ('Host', 'www.cybersyndrome.net'),
    ('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Encoding', 'gzip, deflate'),
    ('Accept-Charset', 'utf-8;q=0.7,*;q=0.7'),
    ('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3'),
)


def parse_update_time(html):
    """Return the page's "Updated: ... JST" stamp as epoch seconds, or None.

    None means the stamp is missing or not in '%Y/%m/%d %H:%M' form.
    NOTE(review): the stamp is labelled JST but is converted with the local
    timezone (time.mktime) -- confirm the host runs in the expected zone.
    """
    match = _UPDATED_RE.search(html)
    if match is None:
        return None
    try:
        parsed = time.strptime(match.group(1).strip(), '%Y/%m/%d %H:%M')
    except ValueError:
        return None
    return int(time.mktime(parsed))


def parse_proxy_entries(html):
    """Return [(area, proxy_address), ...] found in the page's <ol> list.

    Items that do not look like a proxy link are skipped.
    """
    start = html.find('<ol>')
    end = html.find('</ol>')
    if start == -1 or end < start:
        return []
    entries = []
    for item in html[start:end].split('</li><li>'):
        match = _ENTRY_RE.search(item)
        if match is None:
            continue
        entries.append((match.group(1).strip(), match.group(3).strip()))
    return entries


def next_sleep_seconds(next_update, now):
    """Return how long to sleep before the next harvesting pass.

    Falls back to DEFAULT_SLEEP_SECONDS when no usable (future) update time
    is known, so time.sleep() never receives a negative value.
    """
    if next_update > now:
        return int(next_update - now)
    return DEFAULT_SLEEP_SECONDS


def _read_body(response):
    """Read a response body, undoing gzip/deflate if the server applied it."""
    import zlib

    body = response.read()
    encoding = (response.info().get('Content-Encoding') or '').lower()
    if encoding == 'gzip':
        return zlib.decompress(body, 16 + zlib.MAX_WBITS)
    if encoding == 'deflate':
        try:
            return zlib.decompress(body)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            return zlib.decompress(body, -zlib.MAX_WBITS)
    return body


def _fetch_page(opener, url, proxy, stamp):
    """Download `url` through `opener`; return the body or None.

    None is returned after MAX_FETCH_FAILURES failed attempts; every failure
    is written to the log.
    """
    request = urllib2.Request(url)
    for name, value in _REQUEST_HEADERS:
        request.add_header(name, value)
    request.add_header('Referer', url)

    for _attempt in range(MAX_FETCH_FAILURES):
        try:
            response = opener.open(request, timeout=FETCH_TIMEOUT_SECONDS)
            try:
                return _read_body(response)
            finally:
                response.close()
        except Exception as e:
            # Untrusted proxies fail in many ways (URLError, socket timeouts,
            # broken status lines, corrupt bodies); all of them mean "retry".
            logging.debug(' ' + str(proxy) + ' [' + stamp + '] ' + url + '---->' + str(e))
    return None


def _retire_proxy(db, proxy_id):
    """Mark a proxy as used so it is not selected again."""
    db.execute("update lists set used=1 where id=%s", proxy_id)
    logging.error('处理id==>' + str(proxy_id))


def _run_cycle(db, parse_failures):
    """Run one harvesting pass.

    Args:
        db: open database connection.
        parse_failures: dict proxy id -> number of unparseable pages seen
            through that proxy; kept across cycles by the caller.

    Returns:
        Seconds to sleep before the next pass, or None when the current
        proxy was retired and the next pass should start immediately.
    """
    proxy_row = db.get("select id,poxyip from lists where used=0 and area != 'CN' order by createdAt desc limit 1")
    if not proxy_row:
        message = '代理已经用光了~~'
        send_mail(mailto_list, "告警", message)
        print(message)
        sys.exit()

    proxy = proxy_row['poxyip']
    proxy_id = int(proxy_row['id'])
    parse_failures.setdefault(proxy_id, 0)

    now = int(time.time())
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now))
    opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy}), urllib2.HTTPHandler)
    added = 0
    latest_update = 0

    for row in db.query('select id,value,time from type order by time asc'):
        if int(row['time']) > now:
            continue  # this listing is not due yet
        type_id = int(row['id'])
        url = PROXY_SOURCE_URL % (row['value'])

        html = _fetch_page(opener, url, proxy, stamp)
        if html is None:
            _retire_proxy(db, proxy_id)
            return None

        update_time = parse_update_time(html)
        if update_time is None:
            # The proxy returned something that is not the expected page.
            parse_failures[proxy_id] += 1
            break
        latest_update = max(latest_update, update_time)

        for area, address in parse_proxy_entries(html):
            # The CRC32 of the address is the de-duplication key.
            checksum = binascii.crc32(address) & 0xffffffff
            if not db.get('select id from lists where formatpoxyip = %s limit 1', checksum):
                added += 1
                db.insert('lists', poxyip=address, formatpoxyip=checksum, area=area, createdAt=now, type=type_id)
        db.execute("update type set time=%s where id=%s", update_time, type_id)

    if parse_failures[proxy_id] >= MAX_PARSE_FAILURES:
        _retire_proxy(db, proxy_id)
        return None

    print("[ %s ] 抓取完毕,总共抓取%d条代理" % (stamp, added))
    return next_sleep_seconds(latest_update, now)


def main():
    """Harvest proxies forever, one database connection per cycle."""
    logging.basicConfig(filename=os.path.join(os.getcwd(), 'log.txt'), level=logging.DEBUG)
    parse_failures = {}
    while True:
        db = Connection(host='192.168.1.188:3000', database='poxy', user='xupeng', password='xupeng')
        try:
            sleep_seconds = _run_cycle(db, parse_failures)
        finally:
            # Always release the connection, including on early exit.
            db.close()
        if sleep_seconds is not None:
            time.sleep(sleep_seconds)


if __name__ == '__main__':
    main()
最近需要用一些代理IP,故想用Python去网站上抓取一些代理,发现国内很多代理都更新不是很及时,而且大多数都不好用,最后找到一个不错的网站,更新也很不错,但需要翻墙。所以采用通过代理去抓取代理,如果代理池用光了,便发送一封邮件来提醒coder。以下是具体的实现方式。
下一篇:php图片处理类