How do I scrape data from the Qimingpian (企名片) website in Python?


2 replies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

class QimingpianCrawler:
    def __init__(self):
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        
    def get_company_list(self, page=1):
        """获取公司列表页数据"""
        url = f"https://www.qimingpian.com/finosda/project/pinvestment?p={page}"
        
        try:
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            
            # Parse the JSON response (Qimingpian typically loads data via AJAX)
            data = response.json()
            
            # Extract the company information
            companies = []
            for item in data.get('data', {}).get('list', []):
                company_info = {
                    'company_name': item.get('pro_name'),
                    'industry': item.get('industry'),
                    'location': item.get('city'),
                    'investment_round': item.get('round'),
                    'amount': item.get('money'),
                    'date': item.get('date'),
                    'investors': item.get('investors')
                }
                companies.append(company_info)
            
            return companies
            
        except Exception as e:
            print(f"获取第{page}页数据失败: {e}")
            return []
    
    def get_company_detail(self, company_id):
        """获取公司详情页数据"""
        url = f"https://www.qimingpian.com/company/detail?key={company_id}"
        
        try:
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract detail fields (adjust the selectors to the actual page structure)
            detail_info = {
                'company_id': company_id,
                'description': self._extract_text(soup, '.company-description'),
                'website': self._extract_attr(soup, '.website-link', 'href'),
                'found_time': self._extract_text(soup, '.found-time'),
                'team_info': self._extract_text(soup, '.team-info'),
                'business_model': self._extract_text(soup, '.business-model')
            }
            
            return detail_info
            
        except Exception as e:
            print(f"获取公司{company_id}详情失败: {e}")
            return {}
    
    def _extract_text(self, soup, selector):
        """辅助方法:提取文本"""
        element = soup.select_one(selector)
        return element.get_text(strip=True) if element else ''
    
    def _extract_attr(self, soup, selector, attr):
        """辅助方法:提取属性"""
        element = soup.select_one(selector)
        return element.get(attr) if element else ''
    
    def crawl_multiple_pages(self, start_page=1, end_page=5):
        """爬取多页数据"""
        all_companies = []
        
        for page in range(start_page, end_page + 1):
            print(f"正在爬取第 {page} 页...")
            
            companies = self.get_company_list(page)
            all_companies.extend(companies)
            
            # Random delay to avoid sending requests too quickly
            time.sleep(random.uniform(1, 3))
            
            # To also collect details, process each company further:
            # for company in companies:
            #     detail = self.get_company_detail(company.get('id'))
            #     company.update(detail)
        
        return all_companies
    
    def save_to_csv(self, data, filename='qimingpian_data.csv'):
        """保存数据到CSV文件"""
        if data:
            df = pd.DataFrame(data)
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"数据已保存到 {filename}")
        else:
            print("没有数据可保存")

# Usage example
if __name__ == "__main__":
    crawler = QimingpianCrawler()
    
    # Crawl the first 3 pages
    data = crawler.crawl_multiple_pages(start_page=1, end_page=3)
    
    # Save the data
    crawler.save_to_csv(data)
    
    # Inspect the scraped data
    print(f"Scraped {len(data)} records in total")
    if data:
        print("前5条记录示例:")
        for i, item in enumerate(data[:5]):
            print(f"{i+1}. {item}")

Key points:

  1. Dynamic data loading: Qimingpian loads its data via AJAX, so parsing the HTML directly may return nothing. Use the Network tab in your browser's developer tools to find the real data interface (see the sketch after this list).

  2. Anti-scraping measures

    • Set reasonable request headers (User-Agent, etc.)
    • Use a Session to keep the connection and cookies alive
    • Add random delays to avoid sending requests too frequently
    • Handle any cookies the site may require
  3. Data parsing

    • List pages usually return JSON; parse them directly with response.json()
    • Detail pages are parsed as HTML with BeautifulSoup
    • Selectors must be adjusted to the actual page structure
  4. Data storage: save the results with pandas as CSV for convenient later analysis (a quick read-back example appears at the end of this answer).
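
To make point 1 concrete, here is a minimal sketch of calling a data interface found through the Network tab directly. The endpoint URL, the p pagination parameter, and the headers below are placeholders standing in for whatever you actually observe; they are not Qimingpian's real API:

import requests

# Hypothetical XHR endpoint spotted in the browser's Network tab;
# replace the URL and params with what you actually observe.
API_URL = "https://www.qimingpian.com/some/xhr/endpoint"

def fetch_page_via_api(page):
    resp = requests.get(
        API_URL,
        params={"p": page},  # pagination parameter name is an assumption
        headers={
            "User-Agent": "Mozilla/5.0",
            "X-Requested-With": "XMLHttpRequest",  # many AJAX endpoints expect this
        },
        timeout=10,
    )
    resp.raise_for_status()
    # If the server answers with HTML (login page, CAPTCHA), the request was
    # blocked or needs cookies; fail loudly instead of mis-parsing it.
    if "application/json" not in resp.headers.get("Content-Type", ""):
        raise ValueError("Response is not JSON; check cookies/anti-bot measures")
    return resp.json()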

Note: in practice you will need to adjust the selectors and API parameters to the site's current structure, and before scraping you should comply with the site's robots.txt and terms of service.
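
Checking robots.txt can be automated with the standard library's urllib.robotparser; a minimal sketch (the path tested is the list URL used above):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser("https://www.qimingpian.com/robots.txt")
rp.read()  # download and parse the robots.txt file

# True only if the given user agent is allowed to fetch that URL
url = "https://www.qimingpian.com/finosda/project/pinvestment"
print(rp.can_fetch("*", url))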

Summary: first analyze the network traffic to find the real data interface, then replicate those requests to fetch the data.
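
As a follow-up to point 4, the saved CSV can be loaded back with pandas for quick analysis. A small sketch, assuming the column names produced by the crawler above:

import pandas as pd

df = pd.read_csv("qimingpian_data.csv")

# Top industries and deal counts per funding round; the column names come
# from company_info in the crawler and may need adjusting to your data.
print(df["industry"].value_counts().head(10))
print(df.groupby("investment_round").size())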


You again?
