最近需要用一些代理IP,故想用python去网站上抓取一些代理,发现国内很多代理更新都不是很及时,而且大多数都不好用,最后找到一个不错的网站,更新也很及时,但需要翻墙。所以采用通过代理去抓取代理的方式,如果代理池用光了,便发送一封邮件来提醒coder。以下是具体的实现方式。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
 
from database import *
import urllib2
import re
import sys
import time
import binascii
import logging
import os
import smtplib
from email.mime.text import MIMEText
 
# Recipients of the alert e-mail.
mailto_list = ["xupeng.js@gmail.com"]

# SMTP server, account name, password, and the domain suffix of the mailbox.
mail_host = "smtp.163.com"
mail_user = "user"
mail_pass = "pwd"
mail_postfix = "163.com"
 
def send_mail(to_list,sub,content):
    '''
    Send a plain-text UTF-8 e-mail through the configured SMTP account.

    to_list: recipient addresses (list or tuple of strings); a single
             address given as a bare string is accepted as well
    sub:     subject line
    content: message body
    Returns True once the server accepted the message, False on any failure
    (the error is printed, never raised).

    send_mail(["aaa@126.com"],"sub","content")
    '''
    # Imported here so the block does not depend on a new file-level import.
    from email.header import Header

    # A bare string would otherwise be joined/iterated character by character.
    if not isinstance(to_list, (list, tuple)):
        to_list = [to_list]

    me = mail_user + "<" + mail_user + "@" + mail_postfix + ">"
    # Declare the charset explicitly: callers pass non-ASCII text, and the
    # default us-ascii charset would deliver it garbled.
    msg = MIMEText(content, 'plain', 'utf-8')
    msg['Subject'] = Header(sub, 'utf-8')
    msg['From'] = me
    # Address lists in headers are comma-separated (RFC 5322), not ';'.
    msg['To'] = ", ".join(to_list)

    s = None
    try:
        s = smtplib.SMTP()
        s.connect(mail_host)
        s.login(mail_user, mail_pass)
        s.sendmail(me, to_list, msg.as_string())
        return True
    except Exception as e:
        print(str(e))
        return False
    finally:
        # Release the socket on failure too, not only on the success path.
        if s is not None:
            s.close()
 
# Per-proxy count of unusable responses (unreadable body, or a page without a
# parsable "Updated" stamp), keyed by the proxy's row id in `lists`.
arr = {}

_OPEN_ATTEMPTS = 11        # connection attempts per page before the proxy is retired
_BAD_RESPONSE_LIMIT = 10   # unusable responses tolerated per proxy
_DEFAULT_SLEEP = 1200      # pause (seconds) when no page yielded an update stamp
_MIN_SLEEP = 60            # floor for the computed pause; it can come out <= 0

_UPDATED_RE = re.compile(r'<b>Updated:(.*)JST</b>')
_ENTRY_RE = re.compile(r'<a\s*title="(.*)"\sonMouseOver(.*)>(.*)</a>')


def parse_proxy_page(html):
    """Extract the update stamp and proxy entries from one listing page.

    Returns ``(updated_at, entries)`` where ``updated_at`` is the page's
    "Updated: ... JST" stamp converted with ``time.mktime`` and ``entries``
    is a list of ``(area, proxy_address)`` tuples. Returns ``None`` when the
    stamp is missing or is not in ``%Y/%m/%d %H:%M`` form.
    """
    stamp = _UPDATED_RE.search(html)
    if stamp is None:
        return None
    try:
        # NOTE(review): the stamp is labelled JST but mktime() interprets it
        # as local time; left as-is because the sleep computation relies on
        # this value - confirm the intended timezone handling.
        updated_at = int(time.mktime(
            time.strptime(stamp.group(1).strip(), '%Y/%m/%d %H:%M')))
    except (ValueError, OverflowError):
        return None

    start = html.find('<ol>')
    if start == -1:
        return updated_at, []
    end = html.find('</ol>', start)
    listing = html[start:] if end == -1 else html[start:end]

    entries = []
    for item in listing.split('</li><li>'):
        hit = _ENTRY_RE.search(item)
        if hit is None:
            # Skip list items that are not proxy links instead of crashing.
            continue
        entries.append((hit.group(1).strip(), hit.group(3).strip()))
    return updated_at, entries


def _build_request(url):
    """Build the browser-like request used for every listing page."""
    request = urllib2.Request(url)
    request.add_header('Host', 'www.cybersyndrome.net')
    request.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1')
    request.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
    # No 'Accept-Encoding: gzip, deflate': urllib2 does not decompress
    # responses, so a compressed body could never match the parser.
    request.add_header('Accept-Charset', 'utf-8;q=0.7,*;q=0.7')
    request.add_header('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3')
    request.add_header('Referer', url)
    return request


def _open_via_proxy(url, proxy, stamp):
    """Open ``url`` through ``proxy``; return the response or None.

    Makes up to ``_OPEN_ATTEMPTS`` attempts. URL errors are written to the
    log with ``stamp`` (the cycle's formatted start time); any other error
    is printed.
    """
    opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy}),
                                  urllib2.HTTPHandler)
    urllib2.install_opener(opener)
    for _ in range(_OPEN_ATTEMPTS):
        try:
            return opener.open(_build_request(url), timeout=20)
        except urllib2.URLError as e:
            logging.debug(' ' + str(proxy) + ' [' + stamp + '] ' + url + '---->' + str(e))
        except Exception as e:
            print(e)
    return None


def _retire_proxy(db, proxy_id):
    """Flag the proxy row as used so it is never selected again."""
    db.execute("update lists set used=1 where id=%s", proxy_id)
    logging.error('处理id==>' + str(proxy_id))


def _scrape_cycle(db):
    """Run one scrape pass over every due page type.

    Picks the newest unused non-CN proxy, fetches each `type` row whose time
    is due, stores previously unseen proxies in `lists`, and updates the
    row's time. Returns the number of seconds to sleep before the next pass;
    0 means "retry immediately" (the proxy failed or was retired).
    Sends an alert mail and exits the process when no proxy is left.
    """
    proxy_row = db.get("select id,poxyip from lists where used=0 and area != 'CN' order by createdAt desc limit 1")
    if not proxy_row:
        message = '代理已经用光了~~'
        send_mail(mailto_list, "告警", message)
        print(message)
        sys.exit()

    proxy = proxy_row['poxyip']
    proxy_id = int(proxy_row['id'])
    arr.setdefault(proxy_id, 0)

    now = int(time.time())
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(now))
    added = 0
    latest_update = 0
    retry_now = False

    for row in db.query('select id,value,time from type order by time asc'):
        if int(row['time']) > now:
            continue
        type_id = int(row['id'])
        url = 'http://www.cybersyndrome.net/%s.html' % (row['value'])

        response = _open_via_proxy(url, proxy, stamp)
        if response is None:
            _retire_proxy(db, proxy_id)
            retry_now = True
            break

        try:
            html = response.read()
        except Exception as e:
            print(e)
            # Count the failure so a proxy whose bodies never arrive is
            # eventually retired instead of being retried forever.
            arr[proxy_id] += 1
            retry_now = True
            break
        finally:
            response.close()

        parsed = parse_proxy_page(html)
        if parsed is None:
            arr[proxy_id] += 1
            break

        updated_at, entries = parsed
        latest_update = max(latest_update, updated_at)
        for area, address in entries:
            checksum = binascii.crc32(address) & 0xffffffff
            known = db.get('select id from lists where formatpoxyip = %s limit 1', checksum)
            if not known:
                added += 1
                db.insert('lists', poxyip=address, formatpoxyip=checksum,
                          area=area, createdAt=now, type=type_id)

        db.execute("update type set time=%s where id=%s", updated_at, type_id)

    if arr[proxy_id] >= _BAD_RESPONSE_LIMIT:
        _retire_proxy(db, proxy_id)
        return 0
    if retry_now:
        return 0

    if latest_update == 0:
        pause = _DEFAULT_SLEEP
    else:
        # The stamp may already lie in the past; never sleep a negative or
        # zero interval (time.sleep rejects negatives, zero would spin).
        pause = max(int(latest_update - now), _MIN_SLEEP)

    print("[ %s ] 抓取完毕,总共抓取%d条代理" % (stamp, added))
    return pause


def _main():
    """Scrape forever, opening a fresh DB connection for every pass."""
    logging.basicConfig(filename=os.path.join(os.getcwd(), 'log.txt'),
                        level=logging.DEBUG)
    while True:
        db = Connection(host='192.168.1.188:3000', database='poxy',
                        user='xupeng', password='xupeng')
        try:
            pause = _scrape_cycle(db)
        finally:
            # Close on every path, including immediate retries and exit.
            db.close()
        if pause > 0:
            time.sleep(pause)


if __name__ == '__main__':
    _main()