diff --git a/source/get_website.py b/source/get_website.py
new file mode 100644
index 0000000000000000000000000000000000000000..49244e00ab8c2b2cdded7517c82e884e45b554b4
--- /dev/null
+++ b/source/get_website.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# coding:utf-8
+"""
+    Author: Fiht --<fiht@qq.com>
+    Purpose: fetch attack targets (scrape site listings from top.chinaz.com)
+    Created: 2016-04-20
+"""
+import sys
+import re
+import requests
+from optparse import OptionParser
+from bs4 import BeautifulSoup
+try:
+    # Python 2 only: force UTF-8 as the default string encoding.
+    reload(sys)
+    sys.setdefaultencoding('utf8')
+except NameError:
+    # reload() is not a builtin on Python 3, where this hack is unnecessary.
+    pass
+count = 0
+#----------------------------------------------------------------------
+def extract_site(html):
+    """Pull the site name out of a listing link such as '..._example.com.html'."""
+    match = re.findall(r'_(.*?)\.html', html)
+    if match:
+        return match[0]
+#----------------------------------------------------------------------
+def extract_weight(html):
+    """Pull the Baidu weight digit out of an icon path like '.../baidu/5.gif'."""
+    match = re.findall(r'themes/default/images/baidu/(.)\.gif', html)
+    if match:
+        return match[0]
+#----------------------------------------------------------------------
+def fetch_sites(page_url, weight=0):
+    """Return the site names on one listing page whose weight exceeds `weight`."""
+    global count
+    sites = []
+    req = requests.get(page_url)
+    soup = BeautifulSoup(req.text, 'lxml')
+    tag = soup.find(class_='listCentent')
+    if tag:
+        for item in tag.contents:
+            item_weight = extract_weight(str(item))
+            if item_weight and int(item_weight) > weight:
+                name = extract_site(str(item))
+                if name:
+                    count += 1
+                    sites.append(name)
+    return sites
+#----------------------------------------------------------------------
+def my_test():
+    """Quick manual check: print the domains listed on the first pages of tag 211."""
+    for page in range(6):
+        req = requests.get('http://top.chinaz.com/tag/211_%d.html' % page)
+        soup = BeautifulSoup(req.text, 'lxml')
+        for cell in soup.findAll(class_='col-gray'):
+            if cell.string and 'www' in cell.string:
+                domain = cell.string.strip()
+                # Drop a leading 'www.' prefix if present.
+                print(domain[4:] if domain.startswith('www.') else domain)
+def run_get(url, page, num_want, file=None, weight=0):
+    """Walk the paginated listing, writing qualifying sites to `file` (or stdout)."""
+    global count
+    count = 0
+    fil = open(file, 'w+') if file else sys.stdout
+    fil.write('template url: %s\nsites with weight greater than %d\n' % (url, weight))
+    # Page 1 normally has its own un-numbered URL, so pagination starts at 2.
+    for i in range(2, page + 1):
+        try:
+            for site in fetch_sites(url.replace('{page}', str(i)), weight=weight):
+                fil.write(site + '\n')
+            if count >= num_want:  # collected enough URLs, stop paging
+                break
+        except KeyboardInterrupt:
+            print('interrupt received, stopping')
+            break
+    if fil is not sys.stdout:
+        fil.close()
+#run_get('http://search.top.chinaz.com/Search.aspx?p={page}&url=%E4%B8%AD%E5%9B%BD', 10, 100)
+if __name__ == "__main__":
+    parser = OptionParser(' %prog args')
+    parser.add_option('-u', '--url', dest='url',
+                      help='template URL, e.g. http://top.chinaz.com/diqu/index_ZhongQing_2.html '
+                           '-> http://top.chinaz.com/diqu/index_ZhongQing_{page}.html, '
+                           'where {page} is the paging variable 0->n')
+    parser.add_option('-p', '--page', dest='page', type='int', default=10,
+                      help='total number of pages to walk')
+    parser.add_option('-n', '--number', dest='num_want', default=9999999, type='int',
+                      help='how many URLs to collect')
+    parser.add_option('-w', '--weight', dest='weight', default=0, type='int',
+                      help='weight filter: only list sites with weight > w (default 0)')
+    parser.add_option('-f', '--file', dest='file', default=None,
+                      help='write results to a file instead of printing')
+    parser.add_option('-s', '--search', dest='keyword', default=None,
+                      help='fetch sites matching a keyword')
+    (options, args) = parser.parse_args()
+    if options.url:  # crawl from a template URL
+        if '{page}' in options.url:
+            run_get(options.url, options.page, options.num_want,
+                    file=options.file, weight=options.weight)
+        else:
+            print('no {page} field found in the template URL, please check')
+    elif options.keyword:  # crawl from a keyword search
+        run_get('http://search.top.chinaz.com/Search.aspx?p={page}&url=%s' % options.keyword,
+                options.page, num_want=options.num_want,
+                file=options.file, weight=options.weight)
+    else:
+        parser.print_help()
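Usage sketch for the two crawl modes the script exposes. This is illustrative only: it assumes the 2016-era top.chinaz.com listing markup and Search.aspx URL pattern are still served, and the page count, weight threshold, keyword, and output filename below are hypothetical placeholders.

    # template-URL mode: Chongqing regional index, 20 pages,
    # keep only sites with Baidu weight > 3, write results to targets.txt
    python source/get_website.py -u 'http://top.chinaz.com/diqu/index_ZhongQing_{page}.html' -p 20 -w 3 -f targets.txt

    # keyword mode: search via Search.aspx, stop after 100 URLs, print to stdout
    python source/get_website.py -s bank -p 5 -n 100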