#!/usr/bin/env python # -*- coding:utf-8 -*- from BeautifulSoup import BeautifulSoup from database import * import urllib2 import re import binascii import time import sys while(1): k = 0 u = 0 url = "http://goodips.com/index.html" db=Connection(host='192.168.1.192:3306',database='agent',user='xupeng',password='xupeng') #befor5day = int(time.time())-5*24*60*60 #db.execute("delete from lists where updateAt < befor5day") opener = urllib2.build_opener() req = urllib2.Request(url) req.add_header('Host','goodips.com') req.add_header('User-Agent', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1') req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8') req.add_header('Accept-Encoding', 'gzip, deflate') req.add_header('Accept-Charset', 'utf-8;q=0.7,*;q=0.7') req.add_header('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3') req.add_header('Referer', url) content = opener.open(req) soup = BeautifulSoup(content) pages = soup.findAll('div',{"class":"page"}) pages = pages.pop() lastPage = int(re.search(r'<a\s*title="最后页".*>(.*)</a>',str(pages)).group(1)) for i in range(1,lastPage+1): url = "http://goodips.com/index.html?pageid=%d"%(i) opener = urllib2.build_opener() req = urllib2.Request(url) req.add_header('Host','goodips.com') req.add_header('User-Agent', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:8.0.1) Gecko/20100101 Firefox/8.0.1') req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8') req.add_header('Accept-Encoding', 'gzip, deflate') req.add_header('Accept-Charset', 'utf-8;q=0.7,*;q=0.7') req.add_header('Accept-Language', 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3') req.add_header('Referer', url) try: content = opener.open(req) except Exception,e: print e continue soup = BeautifulSoup(content) contents = soup.find('table',{"class":"ctable"}) trs = contents.findAll('tr') del(trs[0]) for tr in trs: tds = tr.findAll('td') ip = 
str(tds[0].string) + ':' + str(tds[1].string) types = str(tds[2].string) location = str(tds[3].string) #处理地区 area = 'fr' if( (location.find('市') != -1) or (location.find('省') != -1) ): area = 'cn' #处理响应时间 ttl = str(tds[5].string).replace('毫秒','').replace('钟','') if ( ttl.find('秒') != -1 ): ttl = float(ttl.replace('秒','.')) else: ttl = float('0.' + ttl) formatip = binascii.crc32(ip) & 0xffffffff nowTime = int(time.time()) agentinfo = db.query("select id from lists where formatip = %s limit 1",formatip) if agentinfo: u = u + 1 db.execute("update lists set ttl = %s,updateAt = %s where formatip = %s",ttl,nowTime,formatip) else: k = k + 1 db.insert('lists',ip=ip,formatip=formatip,type=types,location=location,area=area,ttl=ttl,createdAt=nowTime,updateAt=nowTime) db.commit() showtime = time.strftime( '%Y-%m-%d %H:%M:%S', time.localtime( time.time() ) ) print '更新时间:%s'%(showtime) print '新增代理IP:%s'%(k) print '更新代理IP:%s'%(u) time.sleep(60)
使用 Python BeautifulSoup 抓取并更新代理 IP【每分钟】
采集网站上的代理 IP,每分钟更新一次。本文使用 BeautifulSoup 来练练手——PS:BeautifulSoup 真的很强大。以下是具体的实现方法。