Batch-downloading AUTOSAR standard documents with a Python script
I wrote a small batch-download script in Python that crawls the standard documents from the AUTOSAR website. It is single-threaded and has essentially no exception handling, so it only does its basic job when the network connection is good.
By default it downloads the R20-11 release and saves the documents into per-category folders under the root of the D: drive. In principle, other releases can be downloaded by editing category and category_names (provided the AUTOSAR website layout does not change); the concrete category values can be found with the developer tools of Chrome/Edge.
Of course, the batch-download feature of a tool such as Thunder (Xunlei) or IDM is probably more efficient than this script; I wrote it purely because I felt like it, mainly for learning and research.
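For reference, the script builds the search URL for each category by appending the category entry to init_url; with the first entry ('[145]=145', the R20-11 Classic Platform), the resulting URL is https://www.autosar.org/nc/document-search/?tx_sysgsearch_pi1[showAll]=1&tx_sysgsearch_pi1[category][145]=145. The full script follows.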
from lxml import html
import requests
import os
def parse_web(web_url):
    """Parse one document-search result page and return a list of document entries."""
    src_list = []
    response = requests.Session().get(web_url)
    web_tree = html.fromstring(response.text)
    # Number of result items on the page
    item_count = int(web_tree.xpath('count(//*[@id="standards"]/form/ul/li)'))
    for i in range(1, item_count + 1):
        # Relative link and title of the i-th document
        doc_href = web_tree.xpath('//*[@id="standards"]/form/ul/li[{}]/div/p[2]/a/@href'.format(i))[0]
        doc_url = base_url + doc_href  # base_url is defined in the __main__ block
        doc_title = web_tree.xpath('//*[@id="standards"]/form/ul/li[{}]/div/p[2]/a/text()'.format(i))[0]
        # Category/metadata text of the entry, cleaned up item by item
        doc_info = web_tree.xpath('//*[@id="standards"]/form/ul/li[{}]/div/p[1]/text()'.format(i))
        doc_info = doc_info[0].split(',')
        for ii, di in enumerate(doc_info):
            doc_info[ii] = di.replace(' ', '').replace('\r\n', '').replace('\t', '').strip()
        doc_cat = ','.join(doc_info)
        src_list.append({'title': doc_title, 'cat': doc_cat, 'url': doc_url})
    return src_list
def download(save_dir, src_url):
    """Download src_url into save_dir, skipping files that already exist."""
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)
    save_path = os.path.join(save_dir, os.path.basename(src_url))
    if os.path.exists(save_path):
        print('{} already exists'.format(os.path.basename(src_url)))
        return
    try:
        print('Downloading {}'.format(src_url))
        file_download = requests.get(src_url)
        with open(save_path, 'wb') as fp:
            fp.write(file_download.content)
        print('Download complete')
    except requests.exceptions.RequestException:
        print('Error while downloading {}'.format(src_url))
if __name__ == '__main__':
    print('Get Ready')
    # Category IDs used by the AUTOSAR document-search page
    category = ['[145]=145',   # classic/20-11
                '[146]=146',   # adaptive/20-11
                '[147]=147',   # foundation/20-11
                '[32]=32',     # acceptance test/2016 R1.2
                '[148]=148']   # application interface/20-11
    category_names = ['classic platform',
                      'adaptive platform',
                      'foundation',
                      'acceptance test',
                      'application interface']
    base_url = 'https://www.autosar.org/'
    init_url = 'https://www.autosar.org/nc/document-search/?tx_sysgsearch_pi1[showAll]=1' \
               '&tx_sysgsearch_pi1[category]'
    doc_save_dir = 'D:\\autosar'
    for ci, ct in enumerate(category):
        cn = category_names[ci]
        doc_cat_dir = os.path.join(doc_save_dir, cn)
        if not os.path.isdir(doc_cat_dir):
            os.makedirs(doc_cat_dir)
        # Parse the search result page for this category
        web_src_list = parse_web(init_url + ct)
        # Write a simple index of title, file name, category info and URL for each document
        with open(os.path.join(doc_cat_dir, 'doc_info.txt'), 'w+', encoding='utf-8') as f_di:
            for web_src_item in web_src_list:
                f_di.write(web_src_item['title'] + '|' + os.path.basename(web_src_item['url']) + '\n|' +
                           web_src_item['cat'] + '\n|' + web_src_item['url'] + '\n\n')
        # Download every document of this category
        for web_src_item in web_src_list:
            download(doc_cat_dir, web_src_item['url'])
    print('ok')
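As noted above, the script has essentially no error handling and assumes a stable connection. Below is a minimal sketch of how the download step could be hardened with a timeout and a few retries; it is not part of the original script, and the function name, retry count, timeout, and sleep interval are all arbitrary assumptions.

import os
import time
import requests

def download_with_retry(save_dir, src_url, retries=3, timeout=30):
    """Hypothetical variant of download() with a timeout and simple retries."""
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, os.path.basename(src_url))
    if os.path.exists(save_path):
        print('{} already exists'.format(os.path.basename(save_path)))
        return
    for attempt in range(1, retries + 1):
        try:
            print('Downloading {} (attempt {})'.format(src_url, attempt))
            resp = requests.get(src_url, timeout=timeout)
            resp.raise_for_status()
            with open(save_path, 'wb') as fp:
                fp.write(resp.content)
            print('Download complete')
            return
        except requests.exceptions.RequestException as exc:
            print('Error while downloading {}: {}'.format(src_url, exc))
            time.sleep(2)  # brief pause before retrying
    print('Giving up on {}'.format(src_url))

To use it, the call download(doc_cat_dir, web_src_item['url']) in the main loop would simply be replaced by download_with_retry(doc_cat_dir, web_src_item['url']).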