#!/usr/bin/env python
# -*- coding:utf-8 -*-
from BeautifulSoup import BeautifulSoup
from database import *
import urllib2
import re
import binascii
import time
import sys
while(1):
k = 0
u = 0
url = "http://goodips.com/index.html"
db=Connection(host='192.168.1.192:3306',database='agent',user='xupeng',password='xupeng')
#befor5day = int(time.time())-5*24*60*60
#db.execute("delete from lists where updateAt < befor5day")
opener = urllib2.build_opener()
req = urllib2.Request(url)
req.add_header('Host','goodips.com')
req.add_header('User-Agent', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1')
req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
req.add_header('Accept-Encoding', 'gzip, deflate')
req.add_header('Accept-Charset', 'utf-8;q=0.7,*;q=0.7')
req.add_header('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3')
req.add_header('Referer', url)
content = opener.open(req)
soup = BeautifulSoup(content)
pages = soup.findAll('div',{"class":"page"})
pages = pages.pop()
lastPage = int(re.search(r'<a\s*title="最后页".*>(.*)</a>',str(pages)).group(1))
for i in range(1,lastPage+1):
url = "http://goodips.com/index.html?pageid=%d"%(i)
opener = urllib2.build_opener()
req = urllib2.Request(url)
req.add_header('Host','goodips.com')
req.add_header('User-Agent', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1')
req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
req.add_header('Accept-Encoding', 'gzip, deflate')
req.add_header('Accept-Charset', 'utf-8;q=0.7,*;q=0.7')
req.add_header('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3')
req.add_header('Referer', url)
try:
content = opener.open(req)
except Exception,e:
print e
continue
soup = BeautifulSoup(content)
contents = soup.find('table',{"class":"ctable"})
trs = contents.findAll('tr')
del(trs[0])
for tr in trs:
tds = tr.findAll('td')
ip = str(tds[0].string) + ':' + str(tds[1].string)
types = str(tds[2].string)
location = str(tds[3].string)
#处理地区
area = 'fr'
if( (location.find('市') != -1) or (location.find('省') != -1) ):
area = 'cn'
#处理响应时间
ttl = str(tds[5].string).replace('毫秒','').replace('钟','')
if ( ttl.find('秒') != -1 ):
ttl = float(ttl.replace('秒','.'))
else:
ttl = float('0.' + ttl)
formatip = binascii.crc32(ip) & 0xffffffff
nowTime = int(time.time())
agentinfo = db.query("select id from lists where formatip = %s limit 1",formatip)
if agentinfo:
u = u + 1
db.execute("update lists set ttl = %s,updateAt = %s where formatip = %s",ttl,nowTime,formatip)
else:
k = k + 1
db.insert('lists',ip=ip,formatip=formatip,type=types,location=location,area=area,ttl=ttl,createdAt=nowTime,updateAt=nowTime)
db.commit()
showtime = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime( time.time() ) )
print '更新时间:%s'%(showtime)
print '新增代理IP:%s'%(k)
print '更新代理IP:%s'%(u)
time.sleep(60)使用python beautifulsoup抓取更新代理【每分钟】
# Scrapes proxy IPs from the site, updated every minute.  Written to practice
# with BeautifulSoup -- PS: BeautifulSoup really is powerful.  The code above
# is the concrete implementation.