# TF-IDF algorithm example
# 0. Import dependencies
import numpy as np
import pandas as pd
import math
# 1. Define the data and preprocess it
docA = "The cat sat on my bed"
docB = "The dog sat on my knees"
bowA = docA.split(" ")
bowB = docB.split(" ")
# print(bowA, bowB)
# Build the vocabulary (the union of the words in both documents)
wordSet = set(bowA).union(bowB)
# print(wordSet)
# 2. Count word occurrences
# Use dictionaries to record how many times each word appears
wordDictA = dict.fromkeys(wordSet, 0)
wordDictB = dict.fromkeys(wordSet, 0)
# print(wordDictA)
# Iterate over each document and count its words
for word in bowA:
    wordDictA[word] += 1
for word in bowB:
    wordDictB[word] += 1
# print(wordDictA)
# result = pd.DataFrame([wordDictA, wordDictB])
# print(result)
# 3. Compute the term frequency (TF)
def computeTF(wordDict, bow):
    # Record each word's TF in a dictionary: its count in this bag-of-words divided by the document length
    tfDict = {}
    nbowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count / nbowCount
    return tfDict
tfA = computeTF(wordDictA, bowA)
tfB = computeTF(wordDictB, bowB)
# results = pd.DataFrame([tfA, tfB])
# print(results)
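# Quick sanity check of the formula TF(w, d) = count(w, d) / len(d):
# docA ("The cat sat on my bed") has 6 tokens and "cat" occurs once,
# so TF("cat", docA) should be 1/6 ≈ 0.167.
assert abs(tfA["cat"] - 1 / 6) < 1e-9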
# 4. Compute the inverse document frequency (IDF)
def computeIDF(wordDictList):
    # Use a dictionary to hold the IDF results; each word is a key with an initial value of 0
    idfDict = dict.fromkeys(wordDictList[0], 0)
    N = len(wordDictList)
    for wordDict in wordDictList:
        # Iterate over every word in this document's dictionary and count Ni,
        # the number of documents containing word i
        for word, count in wordDict.items():
            if count > 0:
                # Increment Ni by 1 and store it in idfDict
                idfDict[word] += 1
    # Ni is now known for every word i; replace it with the final IDF value
    # using the smoothed formula idf(i) = log10((N + 1) / (Ni + 1))
    for word, ni in idfDict.items():
        idfDict[word] = math.log10((N + 1) / (ni + 1))
    return idfDict
idfs = computeIDF([wordDictA, wordDictB])
# print(idfs)
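# Quick sanity check of the smoothed formula IDF(w) = log10((N + 1) / (Ni + 1)), with
# N = 2 documents: "sat" occurs in both documents (Ni = 2), so IDF("sat") = log10(3/3) = 0,
# while "cat" occurs only in docA (Ni = 1), so IDF("cat") = log10(3/2) ≈ 0.176.
assert idfs["sat"] == 0
assert abs(idfs["cat"] - math.log10(3 / 2)) < 1e-9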
# 5. Compute TF-IDF
def computeTFIDF(tf, idfs):
    # Multiply each word's TF by its IDF
    tfidf = {}
    for word, tfval in tf.items():
        tfidf[word] = tfval * idfs[word]
    return tfidf
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
# print(tfidfA, tfidfB)
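# Quick sanity check combining both formulas, TFIDF(w, d) = TF(w, d) * IDF(w):
# for "cat" in docA this is (1/6) * log10(3/2) ≈ 0.029, while words shared by
# both documents (e.g. "sat", "The") end up with a weight of 0.
assert abs(tfidfA["cat"] - (1 / 6) * math.log10(3 / 2)) < 1e-9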
results = pd.DataFrame([tfidfA, tfidfB])
print(results)
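# Optional cross-check against scikit-learn's TfidfVectorizer. This is only a sketch and
# assumes a recent scikit-learn (>= 1.0, for get_feature_names_out) is installed; it is not
# needed for the example above. sklearn uses a different IDF formula (natural log with its
# own smoothing), L2 normalization and lowercasing, so the absolute values will not match
# the hand-rolled numbers, but shared words should still receive lower weights than words
# unique to a single document.
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform([docA, docB])
    print(pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()))
except ImportError:
    pass  # scikit-learn not available; skip the cross-check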