Crawling Baidu search
import requests
from bs4 import BeautifulSoup
from threading import Thread

class Mythread(Thread):  # Thread subclass, reworked so the worker's return value can be read back
    def __init__(self, url, func):
        Thread.__init__(self)
        self.url = url
        self.func = func

    def run(self):
        self.result = self.func(self.url)

    def get_result(self):
        return self.result
class SpiderBaidu:
    def __init__(self, high='', key='messi', pn=10):
        self.realkey = key                       # the actual keyword
        self.key = "&wd=" + high + str(key)      # with advanced search operators such as inurl:
        self.url = 'http://www.baidu.com/s?&ie=utf-8' + self.key
        self.pn = pn                             # number of result pages to fetch
        self.urls = []
    def get(self, url):  # request one Baidu results page
        try:
            r = requests.get(url, timeout=3)
            if r.status_code != 200:
                print("[-] --- PAGE_NOT_200")
                return None
            return r.text
        except requests.RequestException:
            print('[-] --- TIME OUT')
            return None
    def parse(self, content):  # parse one results page
        if content is None:
            print("EMPTY CONTENT")
            return
        soup = BeautifulSoup(content, 'lxml')
        links = soup.find_all('a')
        urls = set()
        for i in links:  # deduplicate
            urls.add(i.get('href'))
        for url in urls:
            if url is None:
                continue
            if 'http' not in url:  # keep only absolute links
                continue
            if "link?url" in url:  # a Baidu redirect link
                try:
                    self.urls.append(requests.get(url, timeout=5).url)  # follow it to the real page
                except requests.RequestException:
                    pass
    def craw(self):  # fire off one parser thread per results page
        th = []
        for i in range(self.pn):
            print("NOW ----> launching threaded requests")
            url = self.url + "&pn=" + str(i * 10)
            r = self.get(url)
            t = Thread(target=self.parse, args=(r,))
            t.start()
            th.append(t)
        for t in th:
            t.join()
    def print_url(self):
        for u in self.urls:
            print("Craw : " + u)
        print("[+] ---- > OK!!!!!!!")

high = 'inurl: '
key = 'php?id=1'
spider = SpiderBaidu(high, key, 20)
spider.craw()
spider.print_url()
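Note that Mythread is written so a worker thread's return value can be read back after join(), but the script above never instantiates it: craw() uses plain Thread objects and parse() writes into self.urls instead. A minimal self-contained sketch of the pattern, where page_len is a hypothetical worker function introduced only for illustration:

import requests
from threading import Thread

class Mythread(Thread):  # same helper as above: exposes the worker's return value
    def __init__(self, url, func):
        Thread.__init__(self)
        self.url = url
        self.func = func
        self.result = None
    def run(self):
        self.result = self.func(self.url)  # stash the return value on the thread object
    def get_result(self):
        return self.result

def page_len(url):  # hypothetical worker: fetch a page, return its size
    return len(requests.get(url, timeout=3).text)

t = Mythread('http://www.baidu.com', page_len)
t.start()
t.join()                 # wait for the worker, then read what it returned
print(t.get_result())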
This one crawls an entire site
import requests
import threading
from bs4 import BeautifulSoup
from urllib.parse import urljoin

class Downloader:  # issue the request and collect the page content
    def get(self, url, content):
        try:
            r = requests.get(url, timeout=10)
            if r.status_code != 200:
                print('Something Error')
                return None
            content.append((url, r.text))  # keep each page paired with its own URL
            return content
        except requests.RequestException:
            print("ERROR")
class UrlManager:  # manage the crawl frontier
    def __init__(self):
        self.new_urls = set()  # not yet crawled
        self.old_urls = set()  # already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
class SpiderMain:
    def __init__(self, root, domain, threadnum, http=False):
        self.Tohttp = http  # downgrade every link to http
        self.urls = UrlManager()
        self.down = Downloader()
        self.root = root
        self.threadnum = threadnum
        self.domain = domain

    def judge(self, domain, url):  # does the link belong to the target domain?
        return url.find(domain) != -1
    def parse(self, page_url, content):  # parse one page
        if content is None:
            return
        soup = BeautifulSoup(content, 'lxml')
        news = self.get_new_urls(page_url, soup)
        return news

    def get_new_urls(self, page_url, soup):  # collect <a> tags and build absolute URLs
        new_urls = set()
        links = soup.find_all('a')
        for link in links:
            new_url = link.get('href')
            new_full_url = urljoin(page_url, new_url)
            if self.Tohttp:
                new_full_url = new_full_url.replace("https://", "http://", 1)
            if self.judge(self.domain, new_full_url):
                new_urls.add(new_full_url)
        return new_urls
    def craw(self):  # drive the crawl: fetch in batches of threadnum threads
        self.urls.add_new_url(self.root)
        while self.urls.has_new_url():
            content = []
            th = []
            for _ in range(self.threadnum):
                if not self.urls.has_new_url():
                    break
                new_url = self.urls.get_new_url()
                print("craw: " + new_url)
                t = threading.Thread(target=self.down.get, args=(new_url, content))
                t.start()
                th.append(t)
            for t in th:
                t.join()
            for page_url, text in content:  # each page is parsed against its own URL
                if text is None:
                    print("Nothing here")
                    continue
                new_urls = self.parse(page_url, text)
                self.urls.add_new_urls(new_urls)
    def all(self):
        print('[+] ALL ' + str(len(self.urls.old_urls)))

if __name__ == '__main__':
    url = 'http://www.andseclab.cn'
    domain = 'andseclab.cn'
    spider = SpiderMain(url, domain, 10, False)
    spider.craw()
    print('[+] All ' + str(len(spider.urls.old_urls)))
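craw() hand-rolls its batching: spawn up to threadnum threads, join them all, then parse the collected pages. A minimal sketch of the same batched-fetch idea using the standard library's ThreadPoolExecutor instead (an alternative to the original threading code, not part of it); fetch mirrors Downloader.get, and the batch list stands in for one round of the frontier:

import requests
from concurrent.futures import ThreadPoolExecutor

def fetch(url):  # fetch one page; return (url, text), or (url, None) on failure
    try:
        r = requests.get(url, timeout=10)
        return (url, r.text) if r.status_code == 200 else (url, None)
    except requests.RequestException:
        return (url, None)

batch = ['http://www.andseclab.cn']  # one round's worth of frontier URLs
with ThreadPoolExecutor(max_workers=10) as pool:
    for url, text in pool.map(fetch, batch):
        if text is not None:
            print('fetched %s (%d bytes)' % (url, len(text)))

pool.map yields results in submission order, so each page stays paired with its URL for free, which is the same problem the (url, r.text) tuple in Downloader.get solves.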