Attack Surface Plugin: Website Crawler Development

Thoughts on Asset Collection

During past initial-access engagements I found that many assets are not picked up by the usual methods such as ICP filing lookups, domain enumeration, or C-class subnet scanning; a lot of them only surface as links or redirects on the homepage or other pages. So I spent a few hours writing this attack-surface plugin.

A Simple Plugin Design

The design is quite simple; after all, this is a script that came together in a hurry.

Crawl the given URL and collect its links

Filter the collected links (a small illustrative example follows this list)

	Four result sets are kept: url_subdomain, subdomain, url_ip, ip

	If the link host is an IP address

		Add the link to url_ip and its host to ip

	If the link host is a domain

		Check whether the link contains any of the keywords login, admin, sign, auth

		Check whether it shares the same main domain as the input URL

		Check whether it is under the same ICP filing (this part has to be adapted to your own lookup source)

		If any of these checks passes, add the link to url_subdomain and its host to subdomain
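
A tiny made-up example of the intended bucketing (the input URL, hosts, and links below are purely illustrative):

# Suppose the input URL is https://www.example.com and the page links to:
#   http://10.1.2.3:8080/console           -> the host is an IP
#   https://sso.example.com/login          -> same main domain, also hits the login keyword
#   https://cdn.example.com/static/app.js  -> same main domain
# The filter should then produce:
#   url_ip        = {'http://10.1.2.3:8080/console'}
#   ip            = {'10.1.2.3'}
#   url_subdomain = {'https://sso.example.com/login', 'https://cdn.example.com/static/app.js'}
#   subdomain     = {'sso.example.com', 'cdn.example.com'}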

Plugin Results

(screenshot of the plugin output)

Plugin Source Code

There are probably still bugs in it, but it's for my own use anyway; I'll fix them later.

websiteinfo.py

import requests
from bs4 import BeautifulSoup
import ipaddress
from urllib.parse import urlparse
import urllib3
import beian

# verify=False is used for the requests below, so silence the InsecureRequestWarning noise
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def get_external_links(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0',
        'Referer': 'https://www.baidu.com/'
    }
    # Send the request and fetch the page content
    try:
        response = requests.get(url, headers=headers, verify=False, timeout=3)
        if response.status_code != 200:
            return []
    except requests.exceptions.RequestException:
        return []
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract every <a> tag that carries an href
    links = soup.find_all('a', href=True)

    # Collect the external link candidates
    external_links = []
    for link in links:
        href = link['href']
        # Check whether the href is an external link
        if is_external_link(href):
            external_links.append(href)

    return external_links


def is_external_link(href):
    # Ignore in-page anchor links
    if href.startswith('#'):
        return False

    # Ignore javascript: links
    if href.startswith('javascript:'):
        return False

    # Keep only absolute links that start with http or https
    if not href.startswith('http://') and not href.startswith('https://'):
        return False

    return True


def filter_external_links(extern_links, url):
    result = {
        'url_subdomain': set(),
        'subdomain': set(),
        'url_ip': set(),
        'ip': set(),
    }

    for link in extern_links:
        # Classify the link by its host type
        if is_ip_address(link):
            # The host is an IP address
            result['url_ip'].add(link)
            result['ip'].add(urlparse(link).hostname)
        else:
            # The host is a domain
            # Check whether the link contains any of the login/admin/sign/auth keywords
            if 'login' in link.lower() or 'admin' in link.lower() or 'sign' in link.lower() or 'auth' in link.lower():
                result['url_subdomain'].add(link)
            # Check whether it shares the same main domain as the input URL
            domain = urlparse(link).hostname
            if is_same_subdomain(domain, urlparse(url).hostname):
                result['url_subdomain'].add(link)
                result['subdomain'].add(domain)
            else:
                # Check whether it is under the same ICP filing (adapt to your own lookup source)
                if is_recorded(domain, urlparse(url).hostname):
                    result['url_subdomain'].add(link)
                    result['subdomain'].add(domain)
    return {key: list(value) for key, value in result.items()}


def is_ip_address(link):
    try:
        parse_link = urlparse(link)
        ipaddress.ip_address(parse_link.hostname)
        return True
    except ValueError:
        return False


def is_same_subdomain(domain1, domain2):
    parts1 = domain1.split('.')
    parts2 = domain2.split('.')

    # Both hostnames need at least two labels
    if len(parts1) < 2 or len(parts2) < 2:
        return False
    # Compare the main-domain label (the second-to-last part)
    return parts1[-2] == parts2[-2]


def is_recorded(domain1, domain2):
    # domain type: test.com
    domain_part1 = domain1.split('.')
    domain_part2 = domain2.split('.')

    # Both hostnames need at least two labels
    if len(domain_part1) < 2 or len(domain_part2) < 2:
        return False

    try:
        parts1 = beian.icp_search(domain_part1[-2])['Company_Name']
        parts2 = beian.icp_search(domain_part2[-2])['Company_Name']
        print(domain_part1[-2], domain_part2[-2])
        return parts1 == parts2
    except Exception:
        return False


def get_site_info_result(url):
    return filter_external_links(get_external_links(url), url)


if __name__ == '__main__':
    url = 'https://www.hnyjj.org.cn'
    external_links = get_external_links(url)
    result = filter_external_links(external_links, url)
    print(result)
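
One known weakness of is_same_subdomain above: it only compares the second-to-last label, so www.hnyjj.org.cn reduces to 'org' and any multi-part suffix (.org.cn, .com.cn, .co.uk, ...) breaks the comparison. A possible drop-in replacement, sketched here under the assumption that the third-party tldextract package is available (it is not used by the original script):

import tldextract


def is_same_main_domain(domain1, domain2):
    # tldextract knows about multi-part public suffixes, so
    # www.hnyjj.org.cn is reduced to ('hnyjj', 'org.cn') rather than 'org'
    ext1 = tldextract.extract(domain1)
    ext2 = tldextract.extract(domain2)
    if not ext1.domain or not ext2.domain:
        return False
    return (ext1.domain, ext1.suffix) == (ext2.domain, ext2.suffix)

The same extraction could also replace domain_part1[-2] when building the ICP filing query in is_recorded.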

beian.py

import time

import requests
from bs4 import BeautifulSoup
import warnings

warnings.filterwarnings("ignore")


def extract_data(html):
    result = {}

    soup = BeautifulSoup(html, 'html.parser')

    icp_type = soup.select_one('td.th:contains(备案类型) + td span').text
    company = soup.select_one('td.th:contains(备案主体) + td a').text
    number = soup.select_one('td.th:contains(备案号) + td a').text
    start = soup.select_one('td.th:contains(备案时间) + td span').text
    end = soup.select_one('td.th:contains(备案时间) + td span').next_sibling.text
    verify_time = start + "-" + end

    result["typ"] = icp_type
    result["comName"] = company
    result["license"] = number
    result["verifyTime"] = verify_time

    return result


def icp_search0(domain):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63",
            "Content-Type": "application/json"
        }
        res = requests.get("https://icplishi.com/" + domain + "/", headers=headers, verify=False).text

        if "备案主体" not in res:
            return None
        info = extract_data(res)
        formatted_data = {"ICP_Type": info["typ"].strip(), "Company_Name": info["comName"].strip(),
                          "ICP_Number": info["license"].strip(), "Verify_Time": info["verifyTime"].strip()}
        return formatted_data
    except Exception as e:
        return None


def icp_search1(domain):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63",
            "Content-Type": "application/json"
        }
        res = requests.get("https://www.aizhan.com/cha/" + domain + "/", headers=headers, verify=False).text

        if "备案信息" not in res:
            return None
        soup = BeautifulSoup(res, 'html.parser')
        icp_number = soup.find("a", id="icp_icp").text
        icp_type = soup.find("span", id="icp_type").text
        icp_company = soup.find("span", id="icp_company").text
        icp_passtime = soup.find("span", id="icp_passtime").text

        formatted_data = {"ICP_Type": icp_type.strip(), "Company_Name": icp_company.strip(),
                          "ICP_Number": icp_number.strip(), "Verify_Time": icp_passtime.strip()}
        # Avoid getting blocked for requesting too quickly
        time.sleep(2)
        return formatted_data
    except Exception as e:
        return None


def icp_search(domain):
    # Try icplishi first, then fall back to aizhan; keep the first result so the
    # same request is not issued twice
    result = icp_search0(domain)
    if result:
        return result
    return icp_search1(domain)
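
A quick way to sanity-check the lookup on its own; the domain below is only an example, and both sources can return None if the page layout changes or the request is blocked:

if __name__ == '__main__':
    # Expected shape on success:
    # {'ICP_Type': ..., 'Company_Name': ..., 'ICP_Number': ..., 'Verify_Time': ...}
    print(icp_search('baidu.com'))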