# Scrape proxy IPs from a listing site, refreshing every minute — a practice
# exercise with BeautifulSoup (which really is powerful). Implementation below.
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from BeautifulSoup import BeautifulSoup
from database import *
import urllib2
import re
import binascii
import time
import sys

while(1):
	k = 0
	u = 0
	url = "http://goodips.com/index.html"
	db=Connection(host='192.168.1.192:3306',database='agent',user='xupeng',password='xupeng')
	
	#befor5day = int(time.time())-5*24*60*60
	#db.execute("delete from lists where updateAt < befor5day")
	
	opener = urllib2.build_opener()
	req = urllib2.Request(url)
	req.add_header('Host','goodips.com')
	req.add_header('User-Agent', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1')
	req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
	req.add_header('Accept-Encoding', 'gzip, deflate')
	req.add_header('Accept-Charset', 'utf-8;q=0.7,*;q=0.7')
	req.add_header('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3')
	req.add_header('Referer', url)
	content = opener.open(req)
	soup = BeautifulSoup(content)
	
	pages = soup.findAll('div',{"class":"page"})
	pages = pages.pop()
	lastPage = int(re.search(r'<a\s*title="最后页".*>(.*)</a>',str(pages)).group(1))
	for i in range(1,lastPage+1):
		url = "http://goodips.com/index.html?pageid=%d"%(i)
		opener = urllib2.build_opener()
		req = urllib2.Request(url)
		req.add_header('Host','goodips.com')
		req.add_header('User-Agent', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1')
		req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
		req.add_header('Accept-Encoding', 'gzip, deflate')
		req.add_header('Accept-Charset', 'utf-8;q=0.7,*;q=0.7')
		req.add_header('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3')
		req.add_header('Referer', url)
                try:
		    content = opener.open(req)
                except Exception,e:
                    print e
                    continue
		soup = BeautifulSoup(content)
		contents = soup.find('table',{"class":"ctable"})
		trs = contents.findAll('tr')
		del(trs[0])
		for tr in trs:
			tds = tr.findAll('td')
			ip = str(tds[0].string) + ':' + str(tds[1].string)
			types = str(tds[2].string)
			location = str(tds[3].string)
			#处理地区
			area = 'fr'
			if( (location.find('市') != -1) or (location.find('省') != -1) ):
				area = 'cn'
			#处理响应时间
			ttl = str(tds[5].string).replace('毫秒','').replace('钟','')	
			if ( ttl.find('秒') != -1 ):
				ttl = float(ttl.replace('秒','.'))				
			else:
				ttl = float('0.' + ttl)
			
			formatip = binascii.crc32(ip) & 0xffffffff
			nowTime = int(time.time())
			
			agentinfo = db.query("select id from lists where formatip = %s limit 1",formatip)
			if agentinfo:
				u = u + 1
				db.execute("update lists set ttl = %s,updateAt = %s where formatip = %s",ttl,nowTime,formatip)
			else:
				k = k + 1
				db.insert('lists',ip=ip,formatip=formatip,type=types,location=location,area=area,ttl=ttl,createdAt=nowTime,updateAt=nowTime)
				
	db.commit()
	showtime = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime( time.time() ) )
	print '更新时间:%s'%(showtime)
	print '新增代理IP:%s'%(k)
	print '更新代理IP:%s'%(u)
	time.sleep(60)