Powered by GitBook

urllib2 + urllib + re

os.makedirs()

os.path.isfile()

os.path.exists()

urllib.urlretrieve(item,filename) # save image to file

os.path.join('/root/hunhan/',file1)

import urllib2
import urllib
import re
import os

class HunHan:
    """Fetch numbered pages from a base URL and download the images on them.

    The page URL is built as ``baselink + str(PageNum)``. Images are located
    with a regular expression that looks for ``<img class="BDE_Image" ...>``
    tags, and each one is saved under ``savedir`` using the last path
    component of its URL as the file name.
    """

    def __init__(self, baselink, savedir='/root/hunhan/'):
        """Store the URL prefix and the download directory.

        baselink -- URL prefix to which the page number is appended.
        savedir  -- directory that downloaded images are written to; it is
                    created on demand by getPageUrl.
        """
        self.baselink = baselink
        self.savedir = savedir

    def getPage(self, PageNum):
        """Return the raw body of page ``PageNum``, or None if it cannot be opened."""
        url = self.baselink + str(PageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
        except urllib2.URLError:
            print('link lost')
            return None
        # Always release the connection, even if reading the body fails.
        try:
            return response.read()
        finally:
            response.close()

    def getPageIndex(self):
        """Print and return the page count scraped from page 1.

        Returns the matched text (a string) or None when page 1 could not be
        fetched or does not contain the expected markup.
        """
        page = self.getPage(1)
        if page is None:
            # getPage already reported the network error; do not hand None to re.
            print("get the page failed")
            return None

        pattern = re.compile('<li class="l_reply_num.*?<span.*?class="red">(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if not result:
            print("get the page failed")
            return None

        print("totally %s pages" % result.group(1))
        return result.group(1)

    def mkdir(self, path):
        """Create ``path`` (including parents) if it does not exist yet.

        Returns True when the directory was created and False when the path
        was already present.
        """
        path = path.strip()
        if os.path.exists(path):
            print('already have the path')
            return False

        print('create new path %s' % path)
        os.makedirs(path)
        return True

    def getPageUrl(self, PageNum):  # parse new link from html file
        """Download every matching image on page ``PageNum`` into ``savedir``.

        Returns the list of file paths that were saved; the list is empty
        when the page could not be fetched or holds no matching image.
        """
        page = self.getPage(PageNum)
        if page is None:
            print('can not get the image url')
            return []

        pattern = re.compile('<img class="BDE_Image".*?src="(.*?)"', re.S)
        result = re.findall(pattern, page)
        # Create the very directory the files are written to, so the
        # downloads do not depend on the current working directory.
        self.mkdir(self.savedir)
        if not result:
            print('can not get the image url')
            return []

        saved = []
        for item in result:
            file1 = item.split('/')[-1]
            filename = os.path.join(self.savedir, file1)
            try:
                urllib.urlretrieve(item, filename)
            except IOError:
                # One broken image should not abort the remaining downloads.
                print('failed to save %s' % item)
                continue
            print(os.path.isfile(filename))
            print('saving the picture to %s' % filename)
            saved.append(filename)
        return saved

    def getPageInfor(self, start, end):
        """Download the images of every page from ``start`` to ``end`` inclusive."""
        for i in range(start, end + 1):
            self.getPageUrl(i)

# URL of the thread up to and including the page-number parameter; the page
# number itself is appended when a page is requested.
baselink = 'http://tieba.baidu.com/p/4347239141?see_lz=1&pn='
hunhan = HunHan(baselink=baselink)

# Report the page count scraped from page 1, then fetch the images found on
# pages 2 through 3.
hunhan.getPageIndex()
hunhan.getPageInfor(start=2, end=3)

results matching ""

No results matching ""