1.
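How to fix "TypeError: 'tuple' object does not support item assignment"
A tuple is immutable, so its items cannot be reassigned in place (it can still be iterated and indexed). Convert it to a list first with list(tuple), modify the list, and convert back with tuple(...) if a tuple is still needed.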
2.
BeautifulSoup(html_text,from_encoding="utf-8")
# encoding=utf8
import sys
reload(sys)
sys.setdefaultencoding('utf8')
Python: Check if a File or Directory Exists
os.path.isfile('./file.txt')
os.path.isdir('./file.txt')
os.path.exists('./file.txt')
Find current directory and file's directory
import os
cwd = os.getcwd()
import re
import time
import urllib
import os
import urllib2
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so
# the process-wide default codec for implicit str <-> unicode conversion can be
# switched to UTF-8. The file writes further down rely on this when they are
# handed decoded (unicode) page text.
reload(sys)
sys.setdefaultencoding('utf-8')
class MaoYan(object):
    """Scraper for the paginated Maoyan movie board.

    Each board page is fetched from ``baseurl + offset``, the movie entries
    are extracted with a regular expression, a textual summary is appended to
    ``result.txt`` in the current working directory, and every poster image is
    downloaded into a directory named after the movie title.
    """

    # One match per <dd> entry. Groups, in order: rank, poster URL, title,
    # cast text, integer part of the score, fractional part of the score.
    # Compiled once at class creation instead of on every page.
    _MOVIE_PATTERN = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?<img data-src="(.*?)".*?name"><a.*?>(.*?)</a>.*?star">(.*?)</p>.*?integer">(.*?)</i>.*?fraction">(.*?)</i>',
        re.S)

    def __init__(self, baseurl):
        """Store the URL prefix to which the page offset is appended."""
        self.baseurl = baseurl

    def getPageUrl(self, PageNum):
        """Download board page ``PageNum`` (1-based) and return its HTML.

        The offset sent to the server is ``10 * (PageNum - 1)``.

        Returns:
            The page body decoded as UTF-8, or ``None`` when the request
            fails with ``urllib2.URLError``.
        """
        offset = str(10 * (PageNum - 1))
        url = self.baseurl + offset
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read().decode('utf-8')
            finally:
                # Always release the connection, even if read/decode fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, 'reason'):
                print('link lost')
            return None

    def makedir(self, path):
        """Create directory ``path`` (whitespace-stripped) if it is missing.

        Returns:
            ``True`` when the directory was created, ``False`` when the path
            already existed.
        """
        path = path.strip()
        if os.path.exists(path):
            print('already have the path')
            return False
        print('create new file %s' % path)
        os.makedirs(path)
        return True

    def writefile(self, content, listflag):
        """Append ``content`` to ``result.txt`` in the working directory.

        Args:
            content: When ``listflag`` is true, a sequence of six strings
                (rank, image URL, title, cast, score integer part, score
                fraction part). Otherwise a string written verbatim.
            listflag: Selects between the labelled-record and raw-text modes.
        """
        if listflag:
            print(content[0])
            # The fraction is folded into the score line only; it is no
            # longer emitted a second time as a stray trailing line.
            lines = [
                'rank is : ' + content[0],
                'image url is : ' + content[1],
                'title is : ' + content[2],
                'actor are : ' + content[3],
                'the score is ' + content[4] + content[5],
            ]
            text = ''.join(line + '\n' for line in lines)
        else:
            text = content
        # The with-statement closes the file; no explicit close() is needed.
        with open('result.txt', 'a') as f:
            f.write(text)

    def getPageInfo(self, PageNum):
        """Return the movie tuples found on board page ``PageNum``.

        Returns:
            A list of 6-tuples as captured by ``_MOVIE_PATTERN``; an empty
            list when the page could not be downloaded.
        """
        page = self.getPageUrl(PageNum)
        if page is None:
            # Download failed: nothing to parse (re.findall would raise).
            return []
        return self._MOVIE_PATTERN.findall(page)

    def parsePage(self, PageNum):
        """Record every movie on page ``PageNum`` and download its poster.

        For each movie a record plus a separator line is appended to
        ``result.txt`` and the poster is saved as ``<title>/<title>.jpg``.
        """
        for item in self.getPageInfo(PageNum):
            self.writefile(item, True)
            self.writefile('\n----------------------------------------------------------------------------\n', False)
            # Use the same stripped title for the directory and the file so
            # both always refer to the path makedir() actually created.
            title = item[2].strip()
            self.makedir(title)
            filename = title + '.jpg'
            # Save via an explicit path instead of chdir()-ing in and out, so
            # a failed download cannot leave the process in the wrong
            # directory (which would also misplace result.txt).
            urllib.urlretrieve(item[1], os.path.join(title, filename))
            print('saving picture %s ' % filename)

    def loadPages(self, start, end):
        """Process board pages ``start`` through ``end`` inclusive."""
        for i in range(start, end + 1):
            print('downloading the %sth page film' % i)
            self.parsePage(i)
# URL prefix of the Maoyan board; the page offset is appended to it.
baseurl = 'http://maoyan.com/board/4?offset='

if __name__ == '__main__':
    # Run the scrape only when executed as a script, so that importing this
    # module does not start network downloads and file writes.
    maoyan = MaoYan(baseurl)
    maoyan.loadPages(1, 3)