发布于 

Python 爬虫:自动爬取 GitHub Issues 中的图片

代码

import concurrent.futures
import time
from pathlib import Path
import requests
from lxml import etree

# Default HTTP request headers. The User-Agent string identifies the client as
# a mobile Chrome/Edge browser; process_bug_data sends it with the page request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/118.0.0.0 Mobile Safari/537.36 Edg/118.0.2088.46'
    ),
}


def download_image(bug_id, img_url, img_index, timeout=30):
    """Download one image and store it under ``vscode-bug-png/<bug_id>/``.

    The image is written to ``<bug_id>-<img_index>.png`` inside that folder,
    which is created on demand. Network and filesystem failures are reported
    on stdout instead of being raised, so one bad image does not stop the
    remaining downloads.

    Args:
        bug_id: Issue identifier; used for both the folder and the file name.
        img_url: URL of the image to fetch.
        img_index: Index of the image within the issue; used in the file name.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would tie up a worker thread forever.
    """
    folder_path = Path(f'vscode-bug-png/{bug_id}')
    filename = folder_path / f'{bug_id}-{img_index}.png'
    try:
        response_img = requests.get(img_url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of saving an error page.
        response_img.raise_for_status()
        folder_path.mkdir(parents=True, exist_ok=True)
        # NOTE(review): the file always gets a .png suffix, whatever the real
        # image format is -- confirm that downstream consumers expect this.
        filename.write_bytes(response_img.content)
    except (requests.RequestException, OSError) as e:
        print(f'图片下载失败: {e}')
    else:
        print(f'图片下载成功: {filename}')


def process_bug_data(bug_id, timestamp, min_bug_id=95766, timeout=30):
    """Fetch one microsoft/vscode issue page and download the images it links.

    Only bugs whose numeric id is greater than ``min_bug_id`` are processed.
    Image URLs are taken from ``<img>`` elements wrapped in links inside
    paragraphs of the first ``div`` with class ``edit-comment-hide``
    (presumably the issue's opening comment -- verify against the live page
    markup). Each URL is handed to ``download_image`` on a small thread pool.
    Errors are printed rather than raised so that one failing bug does not
    stop a batch run.

    Args:
        bug_id: Issue number (string or int); must be convertible with ``int``.
        timestamp: Timestamp associated with the bug; only used for logging.
        min_bug_id: Bugs with an id less than or equal to this are skipped.
        timeout: Seconds to wait for the issue page before giving up.
    """
    try:
        numeric_id = int(bug_id)
    except (TypeError, ValueError):
        # A malformed id should skip this record, not abort the whole run.
        print(f'无效的Bug-id: {bug_id!r}')
        return
    if numeric_id <= min_bug_id:
        return

    print(f'Bug-id: {bug_id}, Timestamp: {timestamp}')
    url = f'https://github.com/microsoft/vscode/issues/{bug_id}'
    try:
        # The context manager closes the session (and its pooled connections)
        # even when the request fails.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=timeout)
            # Fail on 4xx/5xx instead of parsing an error page as an issue.
            response.raise_for_status()
            data = response.content.decode()

        tree = etree.HTML(data)
        image_urls = tree.xpath('(//div[@class="edit-comment-hide"])[1]//p//a[1]/img[1]/@src')
        print(image_urls)
        if not image_urls:
            return  # nothing to download; skip creating a thread pool

        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            futures = []
            for i, img_url in enumerate(image_urls):
                futures.append(executor.submit(download_image, bug_id, img_url, i))
                time.sleep(1)  # space submissions one second apart

            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()  # re-raises anything the worker did not handle
                except Exception as e:
                    print(f'处理图片下载任务时出错: {e}')
    except Exception as e:
        # Per-bug boundary: report and carry on with the next bug.
        print(f'处理Bug数据时出错: {e}')


def main(timestamp_path='/Users/weijiajin/Downloads/DBRD数据集/vscode/timestamp_file.txt'):
    """Read ``bug_id=timestamp`` records from a file and process every bug.

    Each non-blank line is split at its first ``=`` into a bug id and a
    timestamp. If a bug id appears more than once, the last timestamp wins.
    Blank lines are ignored and lines without ``=`` are reported and skipped,
    so a trailing newline or a stray line no longer aborts the run.

    Args:
        timestamp_path: Path of the input file. Defaults to the location the
            script was originally written for.
    """
    data = {}
    with open(timestamp_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # blank line carries no record
            bug_id, sep, timestamp = line.partition('=')
            if not sep:
                print(f'跳过无法解析的行: {line!r}')
                continue
            data[bug_id] = timestamp

    for bug_id, timestamp in data.items():
        process_bug_data(bug_id, timestamp)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()