How do I scrape data from the Qimingpian (企名片) website in Python?
It seems that once an account hits the site's API a certain number of times, it gets blocked.
2 replies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random


class QimingpianCrawler:
    def __init__(self):
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }

    def get_company_list(self, page=1):
        """Fetch one page of the company list."""
        url = f"https://www.qimingpian.com/finosda/project/pinvestment?p={page}"
        try:
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            # Parse the JSON payload (Qimingpian usually loads data dynamically)
            data = response.json()
            # Extract the company records
            companies = []
            for item in data.get('data', {}).get('list', []):
                company_info = {
                    'company_name': item.get('pro_name'),
                    'industry': item.get('industry'),
                    'location': item.get('city'),
                    'investment_round': item.get('round'),
                    'amount': item.get('money'),
                    'date': item.get('date'),
                    'investors': item.get('investors')
                }
                companies.append(company_info)
            return companies
        except Exception as e:
            print(f"Failed to fetch page {page}: {e}")
            return []

    def get_company_detail(self, company_id):
        """Fetch a company's detail page."""
        url = f"https://www.qimingpian.com/company/detail?key={company_id}"
        try:
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract detail fields (adjust the selectors to the actual page structure)
            detail_info = {
                'company_id': company_id,
                'description': self._extract_text(soup, '.company-description'),
                'website': self._extract_attr(soup, '.website-link', 'href'),
                'found_time': self._extract_text(soup, '.found-time'),
                'team_info': self._extract_text(soup, '.team-info'),
                'business_model': self._extract_text(soup, '.business-model')
            }
            return detail_info
        except Exception as e:
            print(f"Failed to fetch details for company {company_id}: {e}")
            return {}

    def _extract_text(self, soup, selector):
        """Helper: extract the text of the first element matching a selector."""
        element = soup.select_one(selector)
        return element.get_text(strip=True) if element else ''

    def _extract_attr(self, soup, selector, attr):
        """Helper: extract an attribute of the first element matching a selector."""
        element = soup.select_one(selector)
        return element.get(attr) if element else ''

    def crawl_multiple_pages(self, start_page=1, end_page=5):
        """Crawl several pages of the company list."""
        all_companies = []
        for page in range(start_page, end_page + 1):
            print(f"Crawling page {page}...")
            companies = self.get_company_list(page)
            all_companies.extend(companies)
            # Random delay to avoid sending requests too fast
            time.sleep(random.uniform(1, 3))
            # If details are needed, process each company further:
            # for company in companies:
            #     detail = self.get_company_detail(company.get('id'))
            #     company.update(detail)
        return all_companies

    def save_to_csv(self, data, filename='qimingpian_data.csv'):
        """Save the collected records to a CSV file."""
        if data:
            df = pd.DataFrame(data)
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Data saved to {filename}")
        else:
            print("No data to save")


# Usage example
if __name__ == "__main__":
    crawler = QimingpianCrawler()
    # Crawl the first 3 pages
    data = crawler.crawl_multiple_pages(start_page=1, end_page=3)
    # Save the results
    crawler.save_to_csv(data)
    # Inspect the crawled data
    print(f"Crawled {len(data)} records in total")
    if data:
        print("First 5 records:")
        for i, item in enumerate(data[:5]):
            print(f"{i+1}. {item}")
Key points:
- Dynamic data loading: Qimingpian loads its data via AJAX, so parsing the raw HTML directly may return nothing. Use the browser devtools Network tab to find the real data endpoint (a sketch of replaying such a request follows the summary at the end of this answer).
- Anti-scraping measures (a backoff sketch follows right after this list):
  - Set reasonable request headers (User-Agent, etc.)
  - Use a Session to keep the session alive
  - Add random delays to avoid requesting too fast
  - Handle any cookies the site may require
- Data parsing:
  - List pages are usually JSON; parse them directly with response.json()
  - Detail pages are parsed with BeautifulSoup
  - Selectors need to be adjusted to the actual page structure
- Data storage: save the results to CSV with pandas for easy follow-up analysis
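On the blocking issue from the question: the safest response is to throttle every request and back off whenever the server starts refusing. Below is a minimal sketch; the 403/429 status codes, the helper name, and the delay values are assumptions for illustration, not observed behaviour of Qimingpian.

import time
import random
import requests

def get_with_backoff(session, url, headers=None, max_retries=3, min_delay=2.0, max_delay=5.0):
    """Hypothetical sketch: GET with throttling and exponential backoff.
    Assumes rate-limited requests come back as HTTP 403/429 (an assumption)."""
    for attempt in range(max_retries):
        # Throttle every request, not just the retries
        time.sleep(random.uniform(min_delay, max_delay))
        resp = session.get(url, headers=headers, timeout=10)
        if resp.status_code in (403, 429):
            # Back off exponentially before trying again
            wait = (2 ** attempt) * 10
            print(f"Rate limited (HTTP {resp.status_code}), waiting {wait}s...")
            time.sleep(wait)
            continue
        resp.raise_for_status()
        return resp
    raise RuntimeError(f"Still blocked after {max_retries} attempts: {url}")

You could swap this helper into QimingpianCrawler.get_company_list in place of the plain session.get call; if a single account's quota is still too small, slowing the crawl further is the only option this sketch covers.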
Note: in practice you will need to adjust the selectors and API parameters to the site's current structure, and check the site's robots.txt and terms of service before scraping.
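Because the response format can differ between the list API and the detail pages (and may change over time), it can help to detect JSON versus HTML before parsing. This is an illustrative helper, not part of any Qimingpian API:

import json
from bs4 import BeautifulSoup

def parse_response(response):
    """Illustrative helper: return parsed JSON when the endpoint returns JSON,
    otherwise fall back to a BeautifulSoup document for HTML pages."""
    content_type = response.headers.get('Content-Type', '')
    if 'application/json' in content_type:
        return response.json()
    try:
        # Some endpoints return JSON without a JSON Content-Type header
        return json.loads(response.text)
    except ValueError:
        # Not JSON: treat it as an HTML detail page
        return BeautifulSoup(response.text, 'html.parser')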
In short: first analyze the network requests to find the real data API, then simulate those requests to fetch the data.
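To make that concrete, here is a sketch of replaying an XHR request captured in the devtools Network tab. The endpoint path, parameter names, and headers below are placeholders, not Qimingpian's real API; copy the actual URL, payload, and any required cookies from your own capture.

import requests

session = requests.Session()
api_url = "https://www.qimingpian.com/api/example/list"  # placeholder endpoint
payload = {"page": 1, "num": 20}                         # placeholder parameters
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "X-Requested-With": "XMLHttpRequest",
}
# Replay the captured request and inspect the JSON it returns
resp = session.post(api_url, data=payload, headers=headers, timeout=10)
resp.raise_for_status()
print(resp.json())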
You again

