NLP自然语言处理
读取文件,去除所有标点符号和换行符,并把所有大写变成小写;合并相同的词,统计每个词出现的频率,并按照词频从大到小排序;将结果按行输出到文件 out.txt。
import re
def parse(text):
    """Count how often each word occurs in ``text``.

    Punctuation and line breaks are replaced by spaces, the text is
    lower-cased, and the remaining words are counted.

    Args:
        text: The raw input string.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Words with the same count keep the order in which they first
        appeared, because ``sorted`` is stable.

    Example:
        >>> parse('xxx yyy zzz xxx yyy yyy')
        [('yyy', 3), ('xxx', 2), ('zzz', 1)]
    """
    # Replace every character that is neither a word character nor a space
    # (punctuation, newlines, tabs, ...) with a space. The class has to be
    # [^\w ]: the previous [^\W] matched the word characters themselves and
    # therefore erased the words while keeping the punctuation.
    text = re.sub(r'[^\w ]', ' ', text)
    text = text.lower()

    # Splitting on a single space yields '' for consecutive spaces; drop
    # those empty entries so they are not counted as words.
    word_list = [word for word in text.split(' ') if word]

    # Tally occurrences, e.g. ['xxx', 'yyy', 'xxx'] -> {'xxx': 2, 'yyy': 1}.
    word_cnt = {}
    for word in word_list:
        word_cnt[word] = word_cnt.get(word, 0) + 1

    # items() yields (word, count) pairs; kv[1] is the count, so this orders
    # by frequency. (Sorting on kv[0] instead would order alphabetically.)
    sorted_word_cnt = sorted(word_cnt.items(), key=lambda kv: kv[1], reverse=True)
    return sorted_word_cnt
# Load the whole input file into memory and compute the word frequencies.
with open('in.txt', 'r') as fin:
    text = fin.read()

word_and_freq = parse(text)

# Emit one "word count" line per entry, in the order returned by parse().
with open('out.txt', 'w') as fout:
    fout.writelines(
        '{} {}\n'.format(word, freq) for word, freq in word_and_freq
    )
JSON序列化
JSON(JavaScript Object Notation),所有事情都用设计的字符串来表示
两种黑箱:
第一种,输入这些杂七杂八的信息,比如 Python 字典,输出一个字符串;
第二种,输入这个字符串,可以输出包含原始信息的 Python 字典。
#json.dumps接受python基本数据类型,序列化为string
>>> params = {'name':'Icey','age':18,'home':'shanxi','weight':99.8}
>>> params_str = json.dumps(params)
>>> print('type of params_str = {},params_str = {}'.format(type(params_str),params))
type of params_str = <class 'str'>,params_str = {'name': 'Icey', 'age': 18, 'home': 'shanxi', 'weight': 99.8}
#json.loads()接收合法字符串,反序列化为python基本数据类型
#需要用 try/except 捕获异常(字符串不合法时 json.loads 会抛出 json.JSONDecodeError)
>>> original_params = json.loads(params_str)
>>> print('type of original_params = {},original_params = {}'.format(type(original_params),original_params))
type of original_params = <class 'dict'>,original_params = {'name': 'Icey', 'age': 18, 'home': 'shanxi', 'weight': 99.8}
文件序列化json+open和read/write
# Serialize params straight into the file. json.dump() writes to the file
# object and returns None, so its result is deliberately not assigned
# (assigning it to params_str would overwrite the JSON string with None).
with open('params.json', 'w') as fout:
    json.dump(params, fout)

# Read the file back and deserialize it into Python objects.
with open('params.json', 'r') as fin:
    original_params = json.load(fin)