|
发表于 2013-11-29 21:43:28
|
显示全部楼层
3.3 ???????????????html
?????????blog.csdn./bagboy_taobao_/article/details/5582868 ??html???????article. (????????????utf8, ?????????). ????article., ??????????. ok, ?????????????? 3.3.1 ???±?????????html
??????????????(???±???)
......
?????????, ??(???б??
???±??????????????????|??????? ????????????????????????.*???.(?????????????: ??????div???????????????div,80shenghuoguan.com ??????????????????????????. ?????????????????????????) 3.3.2 ?????????????е???
???щ??????????щ??, ????????????????, ??????????????????l????????l??. ?????????????????е??html
(csdn???????img??????????) ????????, ?????????е?????img???, ?????????url, ????????????浽????, ????img?????url'????????????·??. ??img????????, ???????????????. ??????????????(?
3.3.3 ?????? #!/usr/bin/env python
# coding=utf-8
# python 2.7.3
# ???????????
# file: getarticle.py
import urllib2
import httplib
import re
# Parser for one article page: `parser` fills the caller-supplied `article`
# list from the page text `str`.
# NOTE(review): this copy of the class is damaged and is not valid Python as
# written (no indentation, emptied regex literals, a stray token, and a
# truncated last line). The notes below mark each spot; restore the class from
# a clean copy before relying on it.
class chygetarticle:
def parser(self, str, article):
# Splits a match on the pattern and stores piece [1] in article[0]
# (presumably the article title -- confirm against a clean copy).
# NOTE(review): `re.pile` / `re.m` are not names in the `re` module
# (presumably `re.compile` / `re.M`), and the pattern text looks emptied.
pattern = re.pile(r'|', re.m)
# NOTE(review): `result` is read here before anything in this method assigns
# it; a preceding statement appears to be missing.
result = pattern.split(result[0])
article[0] = result[1]
# NOTE(review): `cacqe.com` on the next line is a stray token that makes the
# statement a syntax error.
article[0] =cacqe.com article[0].replace("\n\r", "") # drop "\n\r" sequences
article[0] = article[0].strip() # trim surrounding whitespace
# Stores the first match of the pattern over `str` in article[1]
# (presumably the article body -- confirm).
# NOTE(review): `re.s` is presumably `re.S`; the pattern text looks emptied.
pattern = re.pile(r'.*?', re.s)
result = pattern.findall(str)
article[1] = result[0]
# NOTE(review): the next line is cut off in the middle of a regex literal and
# runs into unrelated `print` text; the rest of the method (which presumably
# filled article[2] with image entries) is not present in this copy.
pattern = re.pile(r'(?> f, ''
print >> f, '',
print >> f, article[0],
print >> f, ''
print >> f, ''
print >> f, ''
print >> f, ''
print >> f, article[0], # print??????????"????", ??????????????
print >> f, article[1]
print >> f, ''
print >> f, ''
# ??????
#for img in article[2]
# ??????
'''
3.4 ?????
????????б?, ?????????б?????????????????, ???????????????.
3.4.1 ???????
1. ????????б?(????浵?б?), ???????????(??????????????浵????). 2. ????????????. 3. ????????????, ???????????????article.txt?????.
3.4.2 ????????
#!/usr/bin/env python
# coding=utf-8
# python 2.7.3
import os
import getcategoryandmonth
import getarticlelist
import getarticle
import urllib2
import httplib
def _fetchpage(host, url):
    """Return the raw response body of an HTTP GET for ``url`` on ``host``.

    The connection is always closed, even when the request or read fails.
    The body is returned undecoded, exactly as read from the socket.
    """
    # Browser-like User-Agent; presumably needed because the server rejects
    # the default Python client -- confirm.
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    conn = httplib.HTTPConnection(host)
    try:
        # HTTP method names are case-sensitive, so it must be upper-case.
        conn.request(method="GET", url=url, headers=headers)
        return conn.getresponse().read()
    finally:
        conn.close()


def gettypelist(host, blogname, list, type):
    """Fetch the blog's front page and parse its category/month listing.

    ``host`` is the server name and ``blogname`` the blog's path component;
    the page requested is ``"/" + blogname``. ``type`` and ``list`` are
    handed unchanged to ``chygetcategoryandmonth.parser`` (the caller reads
    its results back out of ``list``).
    """
    page = _fetchpage(host, "/" + blogname)
    getcategoryandmonth.chygetcategoryandmonth().parser(page, type, list)


def gettypearticlelist(host, articlelisturl, list):
    """Fetch one article-listing page and parse it.

    ``articlelisturl`` is requested from ``host``; the raw page and ``list``
    are handed to ``chygetarticlelist.parser`` (the caller reads its results
    back out of ``list``).
    """
    page = _fetchpage(host, articlelisturl)
    getarticlelist.chygetarticlelist().parser(page, list)


def getarticlefun(host, articleurl, article):
    """Fetch one article page and parse it.

    ``articleurl`` is requested from ``host``; the raw page and ``article``
    are handed to ``chygetarticle.parser`` (the caller reads its results back
    out of ``article``).
    """
    page = _fetchpage(host, articleurl)
    getarticle.chygetarticle().parser(page, article)
def validfilename(filename):
    """Return ``filename`` as text with characters unsafe in file names removed.

    ``filename`` may be UTF-8 encoded bytes or already-decoded text. The
    characters removed are ``/ \\ ? : * " ' < > |``: the ones the original
    stripped (``/ ? : " '``) plus the remaining characters Windows forbids in
    file names, since the result is used to create directories.

    Raises ``UnicodeDecodeError`` if byte input is not valid UTF-8.
    """
    # Only decode byte strings; text is used as-is instead of failing on
    # non-ASCII characters.
    if isinstance(filename, bytes):
        filename = filename.decode("utf8")
    # Mapping a code point to None makes translate() delete it in one pass.
    forbidden = dict.fromkeys(map(ord, u'/\\?:*"\'<>|'))
    return filename.translate(forbidden)
def downimg(imgurl, name):
    """Download the image at ``imgurl`` and write it to the file ``name``.

    ``imgurl`` may be a full URL or a bare path; everything up to and
    including the image host name is dropped to obtain the request path.
    The connection and the output file are closed even when an error occurs.
    """
    host = "img.blog.csdn.net"
    # Keep only what follows the host name so the request line is a path.
    hostpos = imgurl.find(host)
    path = imgurl if hostpos == -1 else imgurl[hostpos + len(host):]
    # Browser-like User-Agent; presumably needed because the server rejects
    # the default Python client -- confirm.
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    conn = httplib.HTTPConnection(host)
    try:
        # HTTP method names are case-sensitive, so it must be upper-case.
        conn.request(method="GET", url=path or "/", headers=headers)
        data = conn.getresponse().read()
    finally:
        conn.close()
    # Binary mode: the payload is image data, not text.
    with open(name, "wb") as out:
        out.write(data)
if __name__ == '__main__':
    # Mirror every article of one blog into a directory tree:
    #   <blogdir>/<category>/<article id>_<title>/
    host = "blog.csdn.net"
    blogname = "bagboy_taobao_"
    blogdir = "f:" + os.sep + blogname  # output root on drive f:
    # Tolerate an existing directory so the script can be re-run.
    if not os.path.isdir(blogdir):
        os.mkdir(blogdir)
    # Category listing; each entry is indexed as [0] = listing URL and
    # [1] = directory name below.
    listtype = []
    gettypelist(host, blogname, listtype, 1)
    for listtypeitem in listtype:
        typedir = blogdir + os.sep + listtypeitem[1]
        if not os.path.isdir(typedir):
            os.mkdir(typedir)
        listarticle = []
        gettypearticlelist(host, listtypeitem[0], listarticle)
        for listarticleitem in listarticle:
            # Filled by getarticlefun: [0] is used as the title, [1] is
            # written after it (presumably the body), [2] holds image entries
            # indexed as [0] = URL, [1] = local file name.
            article = [None, None, []]
            getarticlefun(host, listarticleitem, article)
            # Directory name: the article id (its URL minus the common
            # prefix) followed by the sanitized title.
            articledir = typedir + os.sep + listarticleitem.replace("/" + blogname + "/article/details/", "") + "_" + validfilename(article[0])
            if not os.path.isdir(articledir):
                os.mkdir(articledir)
            # NOTE(review): the file name below ends in a bare dot and the
            # '' literals written around the article are empty in this copy;
            # presumably an extension and HTML markup were lost -- restore
            # them from a clean copy.
            title = articledir + os.sep + "article."
            with open(title, 'w') as f:
                print >> f, ''
                print >> f, '',
                print >> f, article[0],
                print >> f, ''
                print >> f, ''
                print >> f, ''
                print >> f, ''
                print >> f, article[0],
                print >> f, article[1]
                print >> f, ''
                print >> f, ''
            # Fetch each referenced image next to the article file.
            for imgitem in article[2]:
                name = articledir + os.sep + imgitem[1]
                downimg(imgitem[0], name)
????????v2?汾???, ??????щ???, ??????щ?????????encode, ??щ?????.
??. с??
1. ??????????????????????????4??html, ???????????????????, ??????????????щ?????????????????, ?????beautifulsoup???, ????????????????????????????????. ???и???. 2. ???????????????????????ι?????????????, ??????, ч?????, ???????????щ, ?????...... 3. ?????????????????.
|
|