姜鹏辉的个人博客 GreyNius

【NLP】LDA主题分类

2020-07-15

简介

LDA 全称 Latent Dirichlet Allocation(隐含狄利克雷分布),是一种无监督的文档主题生成模型,可以将每篇文档表示为若干潜在主题的概率分布。

使用方法-sklearn

使用方法-gensim

from gensim import corpora, models, similarities
import gensim


# docs: a list of raw document strings, e.g. ["text one", "text two"].
# NOTE(review): `docs` and `stoplist` are placeholders in this example
# and must be defined upstream before running.
texts = [[word for word in doc.lower().split() if word not in stoplist] for doc in docs]

# Build the id <-> token dictionary from the tokenized documents.
dictionary = corpora.Dictionary(texts)

# Convert each document to a bag-of-words: a list of (token_id, count).
corpus = [dictionary.doc2bow(text) for text in texts]

# TF-IDF weighting (computed for reference only; the LDA model below is
# trained on the raw bag-of-words counts, not on corpus_tfidf).
corpus_tfidf = models.TfidfModel(corpus)[corpus]

# Number of latent topics. Kept in a variable (instead of a hard-coded
# literal in the LdaModel call) because the statistics and display code
# further down reference `num_topics`.
num_topics = 10

lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                      alpha='auto', eta='auto',
                      update_every=1, chunksize=100, passes=1)

# Per-document topic distributions: for each document, a list of
# (topic_id, probability) pairs.
doc_topics = lda.get_document_topics(corpus)

# Count, for each topic, how many documents have it as their dominant
# topic; documents whose strongest topic is still weak (< 0.15) are
# counted separately as "unknown".
num_topics = lda.num_topics  # read from the model so this section is self-contained
distri_ = [0] * num_topics
cnt_ = 0
for topic_dist in doc_topics:
    # Dominant topic = the (topic_id, prob) pair with the highest prob.
    # (Replaces the undefined `get_locmax` helper, which was also called
    # three times per iteration.)
    best_topic, best_prob = max(topic_dist, key=lambda tp: tp[1])
    if best_prob < 0.15:
        # Distribution too diffuse to assign a topic confidently.
        cnt_ += 1
        continue
    distri_[best_topic] += 1
print("{}个主题分布为{}".format(num_topics,distri_))
print("未知主题的有{}个".format(cnt_))

print("-------------")
num_show_term = 10  # how many top words to display per topic
print('8.结果:每个主题的词分布:--')
for topic_id in range(lda.num_topics):
    # get_topic_terms returns (term_id, probability) pairs already sorted
    # by descending probability, so a plain slice yields the top terms.
    # The term ids are plain ints — the original numpy round-trip (and the
    # `np.int` cast, removed in NumPy 1.24) is unnecessary, and `np` was
    # never imported here.
    for term_id, _prob in lda.get_topic_terms(topicid=topic_id)[:num_show_term]:
        print(dictionary.id2token[term_id], end=',')
    print()

参考


Comments

Content