Python实现快速保存微信公众号文章中的图片

Tanisha ·

更新时间:2024-11-13

· 1897 次阅读

一、实现效果(以槿泉壁纸为例)

二、实现过程

三、源码

四、Python正则表达式匹配日期与时间

一、实现效果(以槿泉壁纸为例)

二、实现过程

1.新建一个名为 link.txt 的文本文件，将需要下载的文章链接依次保存（每行一个链接）；

2.新建一个.py文件，将下面的源码复制进去；

3.新建一个pic文件夹，用来保存图片；

4.运行即可；

三、源码

source code

代码如下（示例）：

import requests
from re import findall
from bs4 import BeautifulSoup
import time
import os
import sys
# Title and publish date of the most recently fetched article. Both are
# written by get_weixin_html() and read by the main script.
weixin_title = ""
weixin_time = ""


# Fetch a WeChat official-account article; record its title and date.
def get_weixin_html(url):
    """Fetch an article page and return its body HTML.

    Side effects: stores the text of the first ``<h1>`` in the module-level
    ``weixin_title`` (``"untitled"`` when there is none) and the first
    ``YYYY-M-D`` date found in the page source in ``weixin_time`` (empty
    string when there is none).

    Args:
        url: Address of the article page.

    Returns:
        The HTML of the ``id="js_content"`` element as a string, with the
        ``style`` of its first ``<div>`` set to ``visibility: visible;`` and
        every matched ``data-src="<url>"`` rewritten to ``src="<url>"``.

    Raises:
        requests.RequestException: Network failure, timeout or HTTP error
            status.
        ValueError: The page has no ``js_content`` element.
    """
    global weixin_time, weixin_title
    # A timeout keeps one stalled connection from hanging the whole batch.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    # get_text() also works when <h1> wraps its text in child tags, a case
    # in which .string is None.
    heading = soup.find('h1')
    title = heading.get_text(strip=True) if heading is not None else ""
    weixin_title = title or "untitled"

    # The first date-looking token in the raw page is taken as the date.
    dates = findall(r"(\d{4}-\d{1,2}-\d{1,2})", res.text)
    weixin_time = dates[0] if dates else ""

    content = soup.find(id='js_content')
    if content is None:
        raise ValueError("未找到文章正文(js_content): " + str(url))
    soup2 = BeautifulSoup(str(content), "html.parser")
    first_div = soup2.div
    if first_div is not None:
        first_div['style'] = 'visibility: visible;'
    html = str(soup2)

    # NOTE: "=-_" inside the character class is a range ('=' through '_'),
    # so characters such as '&' and '%' are not matched and a URL is cut at
    # the first one. A data-src whose value is longer than the match is
    # therefore left untouched by the replacement below.
    pattern = r'http[s]?:\/\/[a-z.A-Z_0-9\/\?=-_-]+'
    # Rename data-src to src for every matched URL.
    for link in set(findall(pattern, html)):
        html = html.replace('data-src="' + link + '"', 'src="' + link + '"')
    return html
# Download every linked file and point the HTML at the local copies.
def download_pic(content, subdir=None):
    """Download the URLs found in *content* and rewrite them to local paths.

    Each distinct ``http(s)://`` URL matched in *content* is fetched, with up
    to 10 attempts one second apart, and saved as ``pic/<subdir>/<n>.<ext>``.
    ``<n>`` is the 1-based position of the URL among the distinct matches.
    ``<ext>`` is ``png`` or ``gif`` when that text occurs in the URL,
    otherwise ``jpg``. A URL that was saved is replaced everywhere in the
    returned text by its local path; a URL that could not be fetched is
    reported and left unchanged.

    Args:
        content: HTML text to scan.
        subdir: Folder name under ``pic/``. When omitted, the module-level
            ``path`` variable is used, which is how existing callers select
            the folder.

    Returns:
        *content* with the downloaded URLs replaced by local file paths.
    """
    if subdir is None:
        subdir = path
    pic_path = 'pic/' + str(subdir) + '/'
    os.makedirs(pic_path, exist_ok=True)

    # NOTE: "=-_" inside the character class is a range ('=' through '_'),
    # so characters such as '&' and '%' are not matched and a URL is cut at
    # the first one.
    pattern = r'http[s]?:\/\/[a-z.A-Z_0-9\/\?=-_-]+'
    # Drop repeats (keeping first-seen order): one download is enough because
    # the replacement below rewrites every occurrence of the URL.
    pic_list = list(dict.fromkeys(findall(pattern, content)))

    for index, pic_url in enumerate(pic_list, 1):
        if 'png' in pic_url:
            file_name = str(index) + '.png'
        elif 'gif' in pic_url:
            file_name = str(index) + '.gif'
        else:
            file_name = str(index) + '.jpg'

        downloaded = False
        for _attempt in range(10):
            try:
                data = requests.get(pic_url, timeout=30)
                # Treat HTTP errors as failures so an error page is never
                # saved under an image file name.
                data.raise_for_status()
                with open(pic_path + file_name, "wb") as f:
                    f.write(data.content)
            except (requests.RequestException, OSError):
                time.sleep(1)
                continue
            downloaded = True
            break

        # Success is tracked explicitly: the attempt counter alone cannot
        # tell "succeeded on the last attempt" from "never succeeded".
        if not downloaded:
            print("下载出错：", pic_url)
            continue

        # Point the HTML at the local copy.
        content = content.replace(pic_url, pic_path + file_name)
        print('已下载第' + str(index) + '张图片.')
        time.sleep(1)
    return content
def get_link(dir):
    """Read article links from a text file, one link per line.

    Args:
        dir: Path of the text file. The name shadows the builtin ``dir`` but
            is kept so existing keyword callers continue to work.

    Returns:
        The non-blank lines of the file in order, each stripped of
        surrounding whitespace.
    """
    # utf-8-sig drops the byte-order mark some editors prepend, which would
    # otherwise become part of the first link.
    with open(dir, 'r', encoding='utf-8-sig') as file_to_read:
        stripped = (line.strip() for line in file_to_read)
        # Blank lines are skipped so they are not handed on as empty URLs.
        return [line for line in stripped if line]
# File that lists the article links, one per line.
path = 'link.txt'
linklist = get_link(path)
print(linklist)
# Number of links read.
s = len(linklist)
if __name__ == "__main__":
    # Each link is handled exactly once. A link that fails validation is
    # reported and skipped instead of being fetched anyway.
    for j, weixin_url in enumerate(linklist, 1):
        # download_pic() reads the module-level `path` to choose the
        # per-article folder pic/<j>/, so rebind it for every link.
        path = j
        # Accept both the short form (/s/<id>) and the long form
        # (/s?__biz=...) of an article link.
        matched = findall(
            r'http[s]?:\/\/mp\.weixin\.qq\.com\/s[\/?][0-9a-zA-Z_]+',
            weixin_url)
        if not matched:
            print("链接有误，请重新输入!", weixin_url)
            continue
        # Fetch the article body, then download what it links to.
        content = get_weixin_html(weixin_url)
        content = download_pic(content)
        # Replace characters that are not allowed in file names (on Windows,
        # and '/' everywhere) so the title can be used as one.
        safe_title = ''.join(
            '_' if ch in '\\/:*?"<>|' else ch for ch in weixin_title)
        # Save the result locally under both extensions.
        for ext in ('.txt', '.html'):
            with open(safe_title + ext, 'w', encoding="utf-8") as f:
                f.write(content)
        print()
        print("标题：《" + weixin_title + "》")
        print("发布时间：" + weixin_time)

四、Python正则表达式匹配日期与时间


import re
from datetime import datetime
# Sample sentences that contain dates and date-times.
test_date = '小明的生日是2016-12-12 14:34,小张的生日是2016-12-21 11:34 .'
test_datetime = '小明的生日是2016-12-12 14:34,.小晴的生日是2016-12-21 11:34,好可爱的.'
# date: re.search returns the first match only
mat = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", test_date)
print(mat.groups())
# ('2016-12-12',)
print(mat.group(0))
# 2016-12-12
# re.findall returns every match
date_all = re.findall(r"(\d{4}-\d{1,2}-\d{1,2})", test_date)
for item in date_all:
    print(item)
# 2016-12-12
# 2016-12-21
# datetime: the same pattern extended with "<whitespace>H:M"
mat = re.search(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", test_datetime)
print(mat.groups())
# ('2016-12-12 14:34',)
print(mat.group(0))
# 2016-12-12 14:34
date_all = re.findall(r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2})", test_datetime)
for item in date_all:
    print(item)
# 2016-12-12 14:34
# 2016-12-21 11:34
## Valid dates
# The pattern only checks digit counts, so an impossible date such as
# 2016-12-35 is matched as well, as shown below.
test_err_date = '如这样的日期2016-12-35也可以匹配到.测试如下.'
print(re.search(r"(\d{4}-\d{1,2}-\d{1,2})", test_err_date).group(0))
# 2016-12-35
# A validity check can be added on top of the regex match.
def validate(date_text):
    """Return True if *date_text* is a real date written exactly as YYYY-MM-DD.

    The text is parsed with ``%Y-%m-%d`` and formatted back with the same
    pattern; it is accepted only when the round trip reproduces the input.
    Impossible dates and unparseable text give False, and so do dates that
    are not zero-padded (e.g. ``2016-1-5``).
    """
    fmt = "%Y-%m-%d"
    try:
        canonical = datetime.strptime(date_text, fmt).strftime(fmt)
    except ValueError:
        return False
    return canonical == date_text
print(validate(re.search(r"(\d{4}-\d{1,2}-\d{1,2})", test_err_date).group(0)))
# False
# Matching other formats, e.g. both 2016-12-24 and 2016/12/24.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
date_reg_exp = re.compile(r'\d{4}[-/]\d{2}[-/]\d{2}')
test_str= """
     平安夜圣诞节2016-12-24的日子与去年2015/12/24的是有不同哦.
     """
# Find all dates with the compiled pattern
matches_list = date_reg_exp.findall(test_str)
# Print each matched date
for match in matches_list:
    print(match)
# 2016-12-24
# 2015/12/24

以上就是Python实现快速保存微信公众号文章中的图片的详细内容，更多关于Python保存文章图片的资料请关注软件开发网其它相关文章！

微信公众号图片微信公众号 Python

1024 个赞