TF-IDF算法示例代码

Gail ·
更新时间:2024-11-15
· 535 次阅读

# TF-IDF worked example
#
# Builds term-frequency (TF), inverse-document-frequency (IDF) and TF-IDF
# tables for two tiny documents and prints the TF-IDF table.

# 0. Dependencies
import numpy as np  # not used below; kept from the original listing
import pandas as pd
import math

# 1. Data and preprocessing
docA = "The cat sat on my bed"
docB = "The dog sat on my knees"

# split() with no argument tolerates repeated/leading/trailing whitespace,
# which split(" ") would turn into empty-string "words".
bowA = docA.split()
bowB = docB.split()

# Vocabulary: every distinct word that appears in either document.
wordSet = set(bowA).union(bowB)

# 2. Word counts
# Seed the count dicts from the *sorted* vocabulary so the column order of
# the resulting tables is the same on every run (set iteration order is not).
wordDictA = dict.fromkeys(sorted(wordSet), 0)
wordDictB = dict.fromkeys(sorted(wordSet), 0)

for word in bowA:
    wordDictA[word] += 1
for word in bowB:
    wordDictB[word] += 1


# 3. Term frequency (TF)
def computeTF(wordDict, bow):
    """Return the term frequency of every word in *wordDict*.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bow: The document as a list of words; only its length is used.

    Returns:
        A dict mapping each word to ``count / len(bow)``. When *bow* is
        empty every frequency is ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    nbowCount = len(bow)
    if nbowCount == 0:
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / nbowCount for word, count in wordDict.items()}


tfA = computeTF(wordDictA, bowA)
tfB = computeTF(wordDictB, bowB)


# 4. Inverse document frequency (IDF)
def computeIDF(wordDictList):
    """Return the smoothed inverse document frequency of every word.

    For each word, ``ni`` is the number of dicts in *wordDictList* in which
    its count is greater than zero, and the result is
    ``log10((N + 1) / (ni + 1))`` with ``N = len(wordDictList)``.

    Args:
        wordDictList: List of word -> count mappings, one per document.
            The dicts need not share the same keys; the union of all keys
            is used.

    Returns:
        A dict mapping each word to its IDF value; ``{}`` for an empty list.
    """
    N = len(wordDictList)

    # First pass: count, per word, the number of documents containing it.
    docCount = {}
    for wordDict in wordDictList:
        for word, count in wordDict.items():
            # Register the word even when its count is 0 so that it still
            # gets an IDF entry.
            docCount.setdefault(word, 0)
            if count > 0:
                docCount[word] += 1

    # Second pass: turn the document counts into IDF values.
    return {
        word: math.log10((N + 1) / (ni + 1))
        for word, ni in docCount.items()
    }


idfs = computeIDF([wordDictA, wordDictB])


# 5. TF-IDF
def computeIFIDF(tf, idfs):
    """Return the TF-IDF weight (``tf * idf``) of every word in *tf*.

    Args:
        tf: Mapping of word -> term frequency.
        idfs: Mapping of word -> inverse document frequency.

    Returns:
        A dict mapping each word in *tf* to ``tf[word] * idfs[word]``.

    Raises:
        KeyError: If a word in *tf* has no entry in *idfs*.
    """
    return {word: tfval * idfs[word] for word, tfval in tf.items()}


# Correctly spelled alias; the original (misspelled) name is kept so that
# existing callers keep working.
computeTFIDF = computeIFIDF

tfidfA = computeIFIDF(tfA, idfs)
tfidfB = computeIFIDF(tfB, idfs)

results = pd.DataFrame([tfidfA, tfidfB])
print(results)
作者:qq_41621342



tf-idf idf 示例

需要 登录 后方可回复, 如果你还没有账号请 注册新账号