
Crawling a specified CSDN blogger's articles and converting them to PDF


Approach:
step 1: crawl the article_ids of all of the blogger's posts
step 2: for each article_id, fetch the article's HTML and extract the part we want
step 3: save it as an HTML file, then also save a more readable PDF version (a sample of the resulting output layout is sketched below)
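
For orientation, after a run the output directory created by the script ends up looking roughly like this (the blogger ID, titles, and article IDs in the file names are made-up placeholders; the naming pattern follows the sane_name variable in the code):

craw_url/
    some_author-Example post title-123456789.html
    some_author-Example post title-123456789.pdf
    some_author-Another post-987654321.html
    some_author-Another post-987654321.pdf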

First, open one of the blogger's articles and observe its URL format.
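
An article URL on CSDN follows a fixed pattern: the blogger ID, then /article/details/, then a numeric article ID. For example (the IDs below are placeholders):

https://blog.csdn.net/<blogger_id>/article/details/<article_id>
e.g. https://blog.csdn.net/some_author/article/details/123456789

So once we know a blogger ID and an article ID, we can reconstruct the article URL directly; this is exactly what crawler_blog_by does in the script below.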

To find where the full article list comes from, click the blogger's avatar to open their article list, then right-click → Inspect, switch to the Network tab, and scroll the page down; the API request that returns the article data will show up there.

Click Preview on that request to inspect the returned JSON.
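
Judging from the fields the script reads below (data.list, articleId, title), the previewed response is shaped roughly like this; the values are placeholders and CSDN returns more fields than shown:

{
  "data": {
    "list": [
      {"articleId": "123456789", "title": "Example post title"},
      {"articleId": "987654321", "title": "Another post"}
    ]
  }
}

Each page returns at most size entries (20 per request here), and the script simply walks page numbers up to MAX_PAGE_NUM.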


import os
import random
import time
import requests
from lxml import etree
import pdfkit

author_name = input('Enter the blogger ID: ')
MAX_PAGE_NUM = 200
i = 1
sess = requests.Session()  # reuse one session so the connection is kept alive
agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.46.110 Safari/537.36'  # User-Agent header to look like a regular browser
sess.headers['User-Agent'] = agent


def crawler_blog_by(author_name, article_id, title):
    """
    Build the article URL from the blogger ID and article ID, fetch the page,
    and save the content as HTML (and then as PDF).
    :param author_name: blogger ID
    :param article_id: article ID
    :param title: article title
    :return:
    """
    article_request_url = f'https://blog.csdn.net/{author_name}/article/details/{article_id}'
    response = sess.get(article_request_url)
    select = etree.HTML(response.text)
    head_msg = select.xpath(r'//head')[0]
    head_str = etree.tostring(head_msg, encoding='utf8', method='html').decode()  # convert the element to a str (kept for reference; the saved file uses a minimal head)
    body_msg = select.xpath(r"//div[@id='content_views']")[0]
    body_str = etree.tostring(body_msg, encoding='utf8', method='html').decode()
    if not os.path.exists('craw_url'):
        os.mkdir('craw_url')
    title = title.replace("/", "-").replace(":", "").replace(": ", "")
    sane_name = f'{author_name}-{title}-{article_id}.html'
    with open(os.path.join('craw_url', sane_name), 'w', encoding='utf-8') as f:
        # write a minimal head plus the article body we extracted
        f.write(f"""
        <head>
        <meta charset="UTF-8">
        </head>

        {body_str}

        """)
    # convert only after the with block closes the file, so the HTML is fully flushed to disk
    html_to_pdf(os.path.join('craw_url', sane_name))

    global i
    print(f'[info] article {author_name}-{title}-{article_id} saved ({i} so far)')
    i += 1


def html_to_pdf(file_html_name):
    """Convert the saved HTML file to a PDF with the same base name."""
    pre_file_name = os.path.splitext(file_html_name)[0]
    wkhtmltopdf_options = {'enable-local-file-access': None}  # let wkhtmltopdf read local resources
    pdfkit.from_file(file_html_name, pre_file_name + '.pdf', options=wkhtmltopdf_options)


# loop over the paginated article list
for each in range(1, MAX_PAGE_NUM + 1):
    try:
        data = {
            'page': each,
            'size': 20,
            'businessType': 'blog',
            'orderby': '',
            'noMore': False,
            'year': '',
            'month': '',
            'username': author_name,
        }
        page_dict = sess.get('https://blog.csdn.net/community/home-api/v1/get-business-list', params=data).json()
        for article in page_dict['data']['list']:
            article_id = article['articleId']
            title = article['title']
            crawler_blog_by(author_name, article_id, title)
            time.sleep(random.uniform(0.4, 1.0))  # random delay to avoid hammering the server
    except Exception as e:
        print(e)  # TODO: write this to a log file instead of printing
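
One practical note: pdfkit is only a thin wrapper around the wkhtmltopdf command-line tool, so wkhtmltopdf has to be installed separately. If the executable is not on the PATH, pdfkit raises an error complaining that no wkhtmltopdf executable was found; in that case you can point pdfkit at it explicitly. A minimal sketch, assuming a typical Windows install location (the path is a placeholder for wherever wkhtmltopdf lives on your machine):

# assumption: wkhtmltopdf installed at this placeholder path
config = pdfkit.configuration(wkhtmltopdf=r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe')
pdfkit.from_file(file_html_name, pre_file_name + '.pdf',
                 options={'enable-local-file-access': None},
                 configuration=config)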





