I wrote a Python batch-download script that scrapes the standard documents from the AUTOSAR official website. There is no multithreading and essentially no exception handling, so it only delivers its basic function when the network is in good shape.
By default it downloads the R20-11 release and saves the files, sorted by category, under the root of drive D. In theory, editing category and category_names lets you download other releases (provided the AUTOSAR site doesn't change its layout); the concrete category values can be found with the developer tools in Chrome/Edge.

Of course, the batch-download feature of tools like Xunlei or IDM is probably more efficient than this script; I wrote it purely as an itch to scratch, mainly for learning and research.
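
For example, with the first category value in the script, the complete search URL requested for the Classic Platform R20-11 documents works out to the following (assuming the site still accepts this query format):

https://www.autosar.org/nc/document-search/?tx_sysgsearch_pi1[showAll]=1&tx_sysgsearch_pi1[category][145]=145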


from lxml import html
import requests
import os


def parse_web(web_url):
    """Parse one document-search result page; return title/category/URL per entry."""
    src_list = []
    response = requests.get(web_url)
    web_tree = html.fromstring(response.text)
    # Count the <li> entries so they can be addressed by index in the XPaths below
    item_count = int(web_tree.xpath('count(//*[@id="standards"]/form/ul/li)'))
    for i in range(1, item_count + 1):
        doc_href = web_tree.xpath('//*[@id="standards"]/form/ul/li[{}]/div/p[2]/a/@href'.format(i))[0]
        doc_url = base_url + doc_href
        doc_title = web_tree.xpath('//*[@id="standards"]/form/ul/li[{}]/div/p[2]/a/text()'.format(i))[0]
        doc_info = web_tree.xpath('//*[@id="standards"]/form/ul/li[{}]/div/p[1]/text()'.format(i))
        doc_info = doc_info[0].split(',')
        # Strip the whitespace padding (double spaces, newlines, tabs) from each fragment
        for ii, di in enumerate(doc_info):
            doc_info[ii] = di.replace('  ', '').replace('\r\n', '').replace('\t', '').strip()
        doc_cat = ','.join(doc_info)
        src_list.append({'title': doc_title, 'cat': doc_cat, 'url': doc_url})
    return src_list
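
# For reference, each entry appended to src_list has this shape (the keys come
# from the code above; the example values are hypothetical placeholders, not
# taken from a live crawl):
# {'title': 'Specification of ...',
#  'cat': 'Standard,Classic Platform',
#  'url': 'https://www.autosar.org/fileadmin/.../SomeDocument.pdf'}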


def download(save_dir, src_url):
    """Download src_url into save_dir, skipping files that are already present."""
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)
    save_path = os.path.join(save_dir, os.path.basename(src_url))

    if os.path.exists(save_path):
        print('{} already exists'.format(os.path.basename(src_url)))
        return

    try:
        print('Downloading {}'.format(src_url))
        # A timeout keeps a stalled connection from hanging the script forever
        file_download = requests.get(src_url, timeout=60)
        with open(save_path, 'wb') as fp:
            fp.write(file_download.content)
            print('Download complete')
    except requests.exceptions.RequestException:
        print('Error while downloading {}'.format(src_url))


if __name__ == '__main__':
    print('Get Ready')

    category = ['[145]=145',  # classic/20-11
                '[146]=146',  # adaptive/20-11
                '[147]=147',  # foundation/20-11
                '[32]=32',  # acceptance test/2016 R1.2
                '[148]=148']  # application interface/20-11
    category_names = ['classic platform',
                      'adaptive platform',
                      'foundation',
                      'acceptance test',
                      'application interface']

    base_url = 'https://www.autosar.org/'
    init_url = 'https://www.autosar.org/nc/document-search/?tx_sysgsearch_pi1[showAll]=1' \
               '&tx_sysgsearch_pi1[category]'

    doc_save_dir = 'D:\\autosar'
    for ci, ct in enumerate(category):
        cn = category_names[ci]
        doc_cat_dir = os.path.join(doc_save_dir, cn)
        if not os.path.isdir(doc_cat_dir):
            os.makedirs(doc_cat_dir)
        web_src_list = parse_web(init_url + ct)
        with open(os.path.join(doc_cat_dir, 'doc_info.txt'), 'w+', encoding='utf-8') as f_di:
            for web_src_item in web_src_list:
                f_di.write(web_src_item['title'] + '|' + os.path.basename(web_src_item['url']) + '\n|' +
                           web_src_item['cat'] + '\n|' + web_src_item['url'] + '\n\n')
        for web_src_item in web_src_list:
            download(doc_cat_dir, web_src_item['url'])

    print('ok')
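
As noted at the top, the script is single-threaded. If you did want to parallelize the downloads, a minimal sketch using the standard library's ThreadPoolExecutor could look like the following; download and web_src_list are the function and variable from the script above, and max_workers=4 is an arbitrary choice, not something the original script uses:

from concurrent.futures import ThreadPoolExecutor, as_completed


def download_all(save_dir, src_list, max_workers=4):
    # requests calls block on network I/O, so a small thread pool is enough
    # to overlap the waits of several downloads
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(download, save_dir, item['url']): item['url']
                   for item in src_list}
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as err:
                print('Failed: {} ({})'.format(futures[future], err))

Replacing the sequential loop in __main__ with download_all(doc_cat_dir, web_src_list) would then fetch each category's documents concurrently.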

Tags: Python, AUTOSAR, batch download, lxml, requests
