Crawling Baidu Search Results and Whole-Site Crawling with Python

Crawling Baidu Search

  • Useful for quickly collecting batches of URLs
  • Works together with Baidu's advanced search operators (a query-building sketch follows)
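The script below builds the query string by plain concatenation; as a minimal sketch (not part of the original script), the same query can be assembled with the keyword URL-encoded via urllib.parse.quote_plus, so that characters such as ? and = in the keyword survive the request:

from urllib.parse import quote_plus

high = 'inurl:'
key = 'php?id=1'
# roughly what SpiderBaidu.__init__ below builds, but with the keyword URL-encoded
url = 'http://www.baidu.com/s?&ie=utf-8&wd=' + quote_plus(high + key)
print(url)  # http://www.baidu.com/s?&ie=utf-8&wd=inurl%3Aphp%3Fid%3D1

The full search script: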
import requests
from bs4 import BeautifulSoup
from threading import Thread

class Mythread(Thread):    # subclass Thread so a child thread's return value can be retrieved (not used by craw() below; see the sketch after the script)
    def __init__(self,url,func):
        Thread.__init__(self)
        self.url= url
        self.func= func

    def run(self):
        self.result = self.func(self.url)

    def get_result(self):
        return self.result


class SpiderBaidu:
    def __init__(self, high ='', key='messi', pn=10):
        self.realkey = key   # the bare keyword
        self.key = "&wd=" + high + str(key)  # keyword with an advanced operator such as inurl: prepended
        self.url = 'http://www.baidu.com/s?&ie=utf-8' + self.key
        self.pn = pn  # number of result pages to fetch
        self.urls = []

    def get(self, url):  # request one page of Baidu results
        try:
            r = requests.get(url,timeout=3)
            if r.status_code != 200:
                print("[-] --- PAGE_NOT_200")
                return None
            return r.text
        except requests.exceptions.RequestException:
            print('[-] --- TIME OUT')
            return None

    def parse(self, content):  # parse one results page
        if content is None:
            print("EMPTY CONTENT")
            return
        soup = BeautifulSoup(content, 'lxml')
        links = soup.find_all('a')
        urls = set()
        for i in links:  # a set deduplicates the hrefs
            urls.add(i.get('href'))
        for url in list(urls):
            if url is not None:
                if 'http' not in url:  # skip javascript:, in-page anchors and other non-HTTP links
                    continue
                if "link?url" in url:
                    try:
                        self.urls.append(requests.get(url, timeout=5).url)  # follow Baidu's redirect to the real page
                    except requests.exceptions.RequestException:
                        pass

    def craw(self):  # kick off the requests
        th = []
        for i in range(self.pn):
            print("NOW ----> launching threaded requests")
            url = self.url + "&pn=" + str(i*10)
            r = self.get(url)
            t = Thread(target=self.parse,args=(r,))
            t.start()
            th.append(t)
        for t in th:
            t.join()

    def print_url(self):
        for i in range(len(self.urls)):
            print("Craw : " + self.urls[i])
        print("[+] ---- >OK!!!!!!!")


high = 'inurl:'  # advanced search operator; no space after the colon
key = 'php?id=1'
spider = SpiderBaidu(high, key,  20)
spider.craw()
spider.print_url()
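The Mythread helper defined at the top of the script is never actually called by craw(), which uses plain Thread objects instead; a minimal sketch of how it could be used, with craw_with_mythread as a hypothetical stand-in for SpiderBaidu.craw(), might look like this:

def craw_with_mythread(spider):
    # hypothetical alternative to SpiderBaidu.craw(): fetch each results page
    # in a Mythread worker, then hand the HTML back via get_result()
    th = []
    for i in range(spider.pn):
        url = spider.url + "&pn=" + str(i * 10)
        t = Mythread(url, spider.get)   # the child thread runs spider.get(url)
        t.start()
        th.append(t)
    for t in th:
        t.join()
        spider.parse(t.get_result())    # parse each fetched page in the main thread

# usage, mirroring the example above:
# craw_with_mythread(spider)
# spider.print_url()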

The second script is a whole-site crawler

  • That is, it crawls every page under the same domain (a stricter domain check is sketched below)
  • It can serve as the crawling stage of a vulnerability scanner (see the follow-up sketch after the script)
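The judge() method in the script below scopes the crawl with a plain substring match; a stricter check, shown here only as an illustrative alternative (same_domain is not part of the script), compares the hostname parsed by urlparse:

from urllib.parse import urlparse

def same_domain(domain, url):
    # illustrative stricter check: compare the parsed hostname instead of
    # searching for the domain anywhere in the URL string
    host = urlparse(url).netloc
    return host == domain or host.endswith('.' + domain)

# same_domain('andseclab.cn', 'http://www.andseclab.cn/index.php')  -> True
# same_domain('andseclab.cn', 'http://andseclab.cn.evil.com/')      -> False

The full whole-site crawler: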
import requests
import threading
from bs4 import BeautifulSoup
from urllib.parse import urljoin

class Downloader:   # send the request and collect the page content
    def get(self, url, content):
        try:
            r = requests.get(url, timeout=10)
            if r.status_code != 200:
                print('Something Error')
                return None
            content.append((url, r.text))  # keep the URL with its page text so relative links can be resolved later
            return content
        except requests.exceptions.RequestException:
            print("ERROR")


class UrlManager:  # keep track of pending and already-crawled URLs
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self,urls):
        if urls is None or len(urls)==0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url


class SpiderMain:
    def __init__(self, root, domain, threadnum, http=False):
        self.Tohttp = http   # rewrite https links to http
        self.urls = UrlManager()
        self.down = Downloader()
        self.root = root
        self.threadnum = threadnum
        self.domain = domain


    def judge(self, domain, url):  # check whether the link stays on the target domain
        return url.find(domain) != -1

    def parse(self, page_url, content):  # parse a fetched page
        if content is None:
            return
        soup = BeautifulSoup(content, 'lxml')
        news = self.get_new_urls(page_url, soup)
        return news

    def get_new_urls(self, page_url, soup):  # collect the <a> tags on the page and build absolute URLs from them
        new_urls = set()
        links = soup.find_all('a')
        for link in links:
            new_url = link.get('href')
            new_full_url = urljoin(page_url, new_url)
            if self.Tohttp:
                new_full_url = new_full_url.replace("https://", "http://", 1)  # str.replace returns a new string
            if self.judge(self.domain, new_full_url):
                new_urls.add(new_full_url)
        return new_urls

    def craw(self):    # drive the crawl: fetch batches of URLs with threads
        self.urls.add_new_url(self.root)
        while self.urls.has_new_url():
            content = []
            th = []
            for i in range(self.threadnum):
                if not self.urls.has_new_url():
                    break
                new_url = self.urls.get_new_url()

                print("craw: " + new_url)
                t = threading.Thread(target=self.down.get, args=(new_url, content))
                t.start()
                th.append(t)
            for t in th:
                t.join()
            for page_url, page in content:
                if page is None:
                    print("Nothing here")
                    continue

                new_urls = self.parse(page_url, page)  # resolve relative links against the page they came from
                self.urls.add_new_urls(new_urls)

    def all(self):
        print('[+] ALL ' + str(len(self.urls.old_urls)))

if __name__ == '__main__':
    url = 'http://www.andseclab.cn'
    domain = 'andseclab.cn'
    spider = SpiderMain(url, domain, 10, False)
    spider.craw()
    print('[+]  All ' + str(len(spider.urls.old_urls)))
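To connect this with the vulnerability-scanner use mentioned above, here is a rough, assumed follow-up step (candidate_targets is not part of the script): filter the crawled URLs in spider.urls.old_urls down to those that carry query parameters, which are the usual starting points for injection testing.

from urllib.parse import urlparse

def candidate_targets(spider):
    # keep only crawled URLs that have a query string, e.g. /news.php?id=3
    return [u for u in spider.urls.old_urls if urlparse(u).query]

# e.g. after spider.craw() has finished:
# for target in candidate_targets(spider):
#     print('scan candidate: ' + target)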
