urllib2 + urllib + re
os.makedirs()
os.path.isfile()
os.path.exists()
urllib.urlretrieve(item,filename) # save image to file
os.path.join('/root/hunhan/',file1)
import urllib2
import urllib
import re
import os
class HunHan:
    """Download the images embedded in the pages of a paged web thread.

    A page is fetched from ``baselink + str(page_number)``; every
    ``<img class="BDE_Image" ... src="...">`` found in it is saved into
    ``savedir`` under the last path segment of its URL.

    The class relies on the module-level ``urllib2`` / ``urllib`` imports,
    i.e. it targets Python 2 at runtime; the syntax used here is also valid
    on Python 3.
    """

    # Compiled once at class creation instead of on every call.
    # Captures the text of the first <span class="red"> after the
    # "l_reply_num" list item, which the caller reports as the page count.
    _PAGE_COUNT_RE = re.compile(
        r'<li class="l_reply_num.*?<span.*?class="red">(.*?)</span>', re.S)
    # Captures the src attribute of every posted image.
    _IMAGE_RE = re.compile(r'<img class="BDE_Image".*?src="(.*?)"', re.S)

    def __init__(self, baselink, savedir='/root/hunhan/'):
        """Remember where to fetch pages from and where to store images.

        baselink -- URL prefix; the page number is appended to it.
        savedir  -- directory the images are written to (created on demand).
        """
        self.baselink = baselink
        self.savedir = savedir

    @staticmethod
    def _parse_page_count(page):
        """Return the page-count text found in *page*, or None if absent."""
        match = HunHan._PAGE_COUNT_RE.search(page)
        return match.group(1) if match else None

    @staticmethod
    def _parse_image_urls(page):
        """Return the list of image URLs found in *page* (possibly empty)."""
        return HunHan._IMAGE_RE.findall(page)

    def getPage(self, PageNum):
        """Fetch page *PageNum* and return its body, or None on URLError."""
        url = self.baselink + str(PageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read()
            finally:
                # Release the connection even if read() fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                print('link lost')
            return None

    def getPageIndex(self):
        """Print and return the page count read from page 1.

        Returns the count as text, or None when the page could not be
        fetched or does not contain the expected markup.
        """
        page = self.getPage(1)
        # getPage() yields None on a network error; do not feed that to re.
        count = None if page is None else self._parse_page_count(page)
        if count is None:
            print("get the page failed")
        else:
            print("totally %s pages" % count)
        return count

    def mkdir(self, path):
        """Create *path* (with parents) if missing.

        Returns True when the directory was created, False when the path
        already existed.
        """
        path = path.strip()
        if os.path.exists(path):
            print('already have the path')
            return False
        print('create new path %s' % path)
        os.makedirs(path)
        return True

    def getPageUrl(self, PageNum):
        """Download every image found on page *PageNum* into ``savedir``."""
        page = self.getPage(PageNum)
        urls = [] if page is None else self._parse_image_urls(page)
        if not urls:
            print('can not get the image url')
            return
        # Create the very directory the files are written to; the two used
        # to be different paths, which broke downloads outside of /root.
        self.mkdir(self.savedir)
        for item in urls:
            filename = os.path.join(self.savedir, item.split('/')[-1])
            urllib.urlretrieve(item, filename)
            print(os.path.isfile(filename))
            print('saving the picture to %s' % filename)

    def getPageInfor(self, start, end):
        """Download the images of pages *start* through *end*, inclusive."""
        for i in range(start, end + 1):
            self.getPageUrl(i)
# Thread to scrape; HunHan appends the page number after the trailing "pn=".
baselink = 'http://tieba.baidu.com/p/4347239141?see_lz=1&pn='

# Only touch the network when executed as a script, so importing this module
# (for example to reuse HunHan) has no side effects.
if __name__ == '__main__':
    hunhan = HunHan(baselink)
    hunhan.getPageIndex()
    hunhan.getPageInfor(2, 3)