import concurrent.futures import time from pathlib import Path import requests from lxml import etree
# User-Agent string presented to the server; it identifies the client as a
# mobile (Android / Nexus 5) Chromium-based Edge browser.
_USER_AGENT = (
    'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/118.0.0.0 Mobile Safari/537.36 Edg/118.0.2088.46'
)

# HTTP request headers passed to the issue-page request in process_bug_data.
headers = {'User-Agent': _USER_AGENT}
def download_image(bug_id, img_url, img_index, timeout=30):
    """Download one image and store it under ``vscode-bug-png/<bug_id>/``.

    The file is written as ``<bug_id>-<img_index>.png``; the ``.png`` suffix
    is applied unconditionally, whatever format the server actually returns.

    This is a best-effort operation: every failure (network error, HTTP error
    status, filesystem error) is printed and swallowed, so the function never
    raises and always returns ``None``.

    Args:
        bug_id: Issue identifier; used for the sub-folder name and as the
            file-name prefix.
        img_url: URL the image is fetched from.
        img_index: Index of the image within the issue; used as the
            file-name suffix.
        timeout: Seconds to wait for the server before giving up.
    """
    try:
        # requests waits indefinitely when no timeout is given; a single
        # stalled server would otherwise block this worker forever.
        response_img = requests.get(img_url, timeout=timeout)
        response_img.raise_for_status()

        folder_path = Path(f'vscode-bug-png/{bug_id}')
        folder_path.mkdir(parents=True, exist_ok=True)

        filename = folder_path / f'{bug_id}-{img_index}.png'
        filename.write_bytes(response_img.content)
        print(f'图片下载成功: {filename}')
    except Exception as e:
        # Deliberately broad: one failed image must not abort the batch.
        print(f'图片下载失败: {e}')
def process_bug_data(bug_id, timestamp, timeout=30):
    """Fetch one VS Code GitHub issue page and download the images it embeds.

    Issues whose numeric id is 95766 or lower are skipped without any
    network access. For the others, the issue page is requested, image URLs
    are extracted with an XPath query restricted to the first
    ``div[@class="edit-comment-hide"]`` element (presumably the issue's
    opening post -- not verifiable from this file), and each URL is handed to
    ``download_image`` on a 4-worker thread pool.

    Errors raised while fetching or parsing the page, or while scheduling
    downloads, are printed and swallowed.

    Args:
        bug_id: Issue number as a string (or anything ``int()`` accepts).
        timestamp: Timestamp associated with the issue; only printed.
        timeout: Seconds to wait for the issue page before giving up.

    Raises:
        ValueError: If ``bug_id`` cannot be converted to an integer (the
            conversion happens before the error-handling section).
    """
    # Guard clause: ids at or below this threshold are not processed.
    if int(bug_id) <= 95766:
        return

    print(f'Bug-id: {bug_id}, Timestamp: {timestamp}')
    url = f'https://github.com/microsoft/vscode/issues/{bug_id}'
    try:
        # The context manager closes the session's connection pool; the
        # session was previously left open on every call.
        with requests.Session() as session:
            response = session.get(url, headers=headers, timeout=timeout)
            # Surface HTTP errors (404, rate limiting, ...) instead of
            # silently parsing an error page and finding no images.
            response.raise_for_status()
            data = response.content.decode()

        tree = etree.HTML(data)
        image_urls = tree.xpath('(//div[@class="edit-comment-hide"])[1]//p//a[1]/img[1]/@src')
        print(image_urls)

        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            futures = []
            for i, img_url in enumerate(image_urls):
                futures.append(executor.submit(download_image, bug_id, img_url, i))
                # Pause between submissions to throttle outgoing requests.
                time.sleep(1)

            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    print(f'处理图片下载任务时出错: {e}')
    except Exception as e:
        # Deliberately broad: one bad issue must not stop the whole run.
        print(f'处理Bug数据时出错: {e}')
def main(timestamp_file='/Users/weijiajin/Downloads/DBRD数据集/vscode/timestamp_file.txt'):
    """Read ``bug_id=timestamp`` pairs from a file and process each issue.

    Each non-blank line is split on its first ``=``; the left part is the bug
    id and the right part is the timestamp. Blank lines are ignored and lines
    without ``=`` are reported and skipped, so a stray line no longer aborts
    the run. If a bug id appears more than once, the last timestamp wins.
    Every collected pair is then passed to ``process_bug_data`` in order of
    first appearance.

    Args:
        timestamp_file: Path of the input file. Defaults to the location the
            script was originally written for.
    """
    data = {}
    with open(timestamp_file, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue

            # partition() splits on the first '=' only, so a timestamp that
            # itself contains '=' is kept intact instead of raising.
            bug_id, separator, timestamp = line.partition('=')
            if not separator:
                print(f'跳过格式错误的行: {line!r}')
                continue

            data[bug_id.strip()] = timestamp.strip()

    # The file is closed before the (slow) network phase starts.
    for bug_id, timestamp in data.items():
        process_bug_data(bug_id, timestamp)
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__': main()
|