Python如何提取html中文本到txt

Diane ·
更新时间:2024-09-20
· 330 次阅读

目录

Python提取html中文本到txt

正则去标签方式

nltk

htmlParser

Python提取txt正则内容

总结

# -*- coding: utf-8 -*-
# Extracting text from HTML into a txt file with Python
# Approach: strip tags with a regular expression
import re

# One HTML tag: "<", one or more characters that are not ">", then ">".
# No DOTALL flag is needed: the negated class [^>] already matches newlines,
# so tags that span several lines are removed as well.
_TAG_PATTERN = re.compile(r'<[^>]+>')


def html_tag_rm(content: str):
    """Return *content* with every ``<...>`` tag removed.

    Only the tag markup itself is deleted; the text between tags is kept
    unchanged and character entities are not decoded.

    Args:
        content: HTML source text.

    Returns:
        The input with each substring matching ``<[^>]+>`` replaced by the
        empty string.
    """
    return _TAG_PATTERN.sub('', content)


# Approach: nltk

比较笨重

需要安装依赖 nltk, numpy, pyyaml

# -*- coding: utf-8 -*-
import re
from html.parser import HTMLParser
from sys import stderr
from traceback import print_exc


def html_tag_rm(content: str):
    """Strip HTML markup from *content* by delegating to ``nltk.clean_html``.

    nltk is imported lazily so that the standard-library parser below stays
    usable when the optional nltk dependency is not installed.

    NOTE(review): NLTK 3.x is reported to have removed the implementation of
    ``clean_html`` (it raises NotImplementedError and points to
    BeautifulSoup's ``get_text()``) -- confirm against the installed version.

    Args:
        content: HTML source text.

    Returns:
        Whatever ``nltk.clean_html(content)`` returns.
    """
    import nltk

    return nltk.clean_html(content)


# Approach: HTMLParser (standard library)

# A run of spaces, tabs, carriage returns or newlines; collapsed to one space.
_WHITESPACE_RUN = re.compile(r'[ \t\r\n]+')


class _DeHTMLParser(HTMLParser):
    """HTML parser that collects text content and basic line breaks."""

    def __init__(self):
        HTMLParser.__init__(self)
        # Output fragments in document order; joined by text().
        self.__text = []

    def handle_data(self, data):
        """Record a text node with its inner whitespace runs collapsed."""
        text = data.strip()
        if text:
            text = _WHITESPACE_RUN.sub(' ', text)
            # Trailing space keeps adjacent text nodes from running together.
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        """Emit a blank line before <p> and a single newline for <br>."""
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):
        """Emit two newlines for a self-closing <br/>."""
        if tag == 'br':
            self.__text.append('\n\n')

    def text(self):
        """Return the collected fragments joined and stripped at both ends."""
        return ''.join(self.__text).strip()


def dehtml(text):
    """Convert HTML markup in *text* to plain text (best effort).

    Args:
        text: HTML source to convert.

    Returns:
        The extracted plain text. If parsing raises, the traceback is printed
        to stderr and *text* is returned unchanged.
    """
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception:
        # Deliberate best-effort fallback: report the failure and hand the
        # input back. KeyboardInterrupt / SystemExit are no longer swallowed.
        print_exc(file=stderr)
        return text


def main():
    """Run dehtml() on a small sample document and print the result."""
    text = r'''
        <html>
            <body>
                <b>Project:</b> DeHTML<br>
                <b>Description</b>:<br>
                This small script is intended to allow conversion from HTML markup to
                plain text.
            </body>
        </html>
    '''
    print(dehtml(text))


if __name__ == '__main__':
    main()


# Extracting content from a txt file with a regular expression

其中:

pattern = re.compile(r'^.*\[\"\"subject\"\"\] \[([^\[]*)\].*')

为修改的正则匹配部分

import re

import pandas as pd

# Captures the contents of the first [...] group that follows the
# [""subject""] marker on a line. Compiled once here instead of once per line.
# For example, a line containing
#     [""subject""] [""hello""] ,accounts
# yields the capture ""hello"" (including its doubled quotes).
pattern = re.compile(r'^.*\[\"\"subject\"\"\] \[([^\[]*)\].*')


def extract_subjects(lines):
    """Collect the regex matches for every line that has at least one.

    Args:
        lines: Iterable of text lines (each is passed through ``str``).

    Returns:
        A list with one entry per matching line; each entry is the list
        returned by ``pattern.findall`` for that line. Lines without a
        match contribute nothing.
    """
    tol = []
    for line in lines:
        url = pattern.findall(str(line))
        # findall always returns a list (never None, never the string '[]'),
        # so an empty list is the only "no match" signal.
        if url:
            tol.append(url)
    return tol


def main():
    """Read C:/data1.txt, print the matches and write them to C:/tol2.csv."""
    # The with-block closes the file; no explicit close() is needed.
    with open("C:/data1.txt", 'r', encoding='UTF-8') as f:
        tol = extract_subjects(f)
    print(tol)
    pd.DataFrame(tol).to_csv('C:/tol2.csv')


if __name__ == '__main__':
    main()


# Summary

以上为个人经验,希望能给大家一个参考,也希望大家多多支持软件开发网。



html中 HTML Python

需要 登录 后方可回复, 如果你还没有账号请 注册新账号