Introduction
LDA (Latent Dirichlet Allocation) is a generative probabilistic topic model: each document is modeled as a mixture of topics, and each topic as a distribution over words.
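For quick reference, the generative process behind LDA in standard notation (not from the original note): with $K$ topics and Dirichlet priors $\alpha$, $\beta$, each document $d$ and each word position $n$ are generated as

$$\theta_d \sim \mathrm{Dir}(\alpha), \qquad \varphi_k \sim \mathrm{Dir}(\beta)$$
$$z_{d,n} \sim \mathrm{Multinomial}(\theta_d), \qquad w_{d,n} \sim \mathrm{Multinomial}(\varphi_{z_{d,n}})$$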
Usage - sklearn
- Reference: 【带你玩转主题模型Topic Model】之 利用sklearn实现Latetnt Dirichlet Allocation(LDA)主题模型 (a Chinese tutorial on implementing LDA with sklearn)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

cntVector = CountVectorizer(stop_words=stopwords)  # stopwords: a predefined list of stop words
cntTf = cntVector.fit_transform(texts)             # texts: a list of raw document strings
lda = LatentDirichletAllocation(n_components=5, max_iter=5,
                                learning_method='online', learning_offset=50.)
docres = lda.fit_transform(cntTf)  # document-topic matrix, shape (n_docs, n_components)
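A minimal sketch of how to inspect the learned topics from the model fitted above; the word list comes from the vectorizer (on sklearn older than 1.0, use get_feature_names() instead of get_feature_names_out()):

import numpy as np

feature_names = cntVector.get_feature_names_out()
for topic_id, topic in enumerate(lda.components_):
    top_ids = np.argsort(topic)[::-1][:10]  # indices of the 10 highest-weight terms
    print("Topic #%d: %s" % (topic_id, ", ".join(feature_names[i] for i in top_ids)))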
Parameters
- n_components: the number of topics; it was called n_topics before version 0.19
- learning_method: either batch or online; online is faster on large datasets, and the default is batch (see the mini-batch sketch after this list)
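If the corpus does not fit in memory, the online variant can also be fed explicitly in mini-batches via partial_fit; a rough sketch reusing cntTf from above, with an arbitrary chunk size of 1000:

lda_online = LatentDirichletAllocation(n_components=5, learning_method='online')
for start in range(0, cntTf.shape[0], 1000):
    # each call performs one online update on a mini-batch of rows
    lda_online.partial_fit(cntTf[start:start + 1000])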
Usage - gensim
from gensim import corpora, models, similarities
import gensim
import numpy as np

docs = [...]  # a list of strings, e.g. ["string 1", "string 2"]
texts = [[word for word in doc.lower().split() if word not in stoplist] for doc in docs]  # stoplist: a predefined set of stop words
# build the dictionary (token -> id mapping)
dictionary = corpora.Dictionary(texts)
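# Optional pruning step (an assumption, not in the original note): drop very rare
# and very common tokens before building the corpus; the thresholds are arbitrary.
dictionary.filter_extremes(no_below=5, no_above=0.5)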
# convert each document to bag-of-words: a list of (token_id, count) pairs
corpus = [dictionary.doc2bow(text) for text in texts]
# compute tf-idf weights (not actually used below; LDA is trained on raw counts)
corpus_tfidf = models.TfidfModel(corpus)[corpus]
num_topics = 10  # also used by the statistics below
lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
                      alpha='auto', eta='auto',
                      update_every=1, chunksize=100, passes=1)
# topic distribution for every document; each element is [(topic_id, probability), ...]
doc_topics = lda.get_document_topics(corpus)
# tally which topic dominates each document
distri_ = [0] * num_topics
cnt_ = 0
for doc in doc_topics:
    topic_id, prob = max(doc, key=lambda x: x[1])  # dominant topic and its probability
    if prob < 0.15:  # distribution too flat: treat the topic as unknown
        cnt_ += 1
        continue
    distri_[topic_id] += 1
print("Distribution over {} topics: {}".format(num_topics, distri_))
print("Documents with unknown topic: {}".format(cnt_))
print("-------------")
num_show_term = 10  # number of top words to show per topic
print('Word distribution for each topic:')
for topic_id in range(num_topics):
    print('Topic #%d:' % topic_id, end='\t')
    # (term_id, probability) pairs, sorted by probability in descending order
    term_distribute = lda.get_topic_terms(topicid=topic_id, topn=num_show_term)
    for term_id, prob in term_distribute:
        print(dictionary[int(term_id)], end=',')  # dictionary[id] maps the id back to its token
    print()
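To apply the trained model to unseen text, the new document has to go through the same preprocessing and doc2bow conversion; a minimal sketch (new_doc is a hypothetical example string):

new_doc = "some new raw text"  # hypothetical input
new_bow = dictionary.doc2bow([w for w in new_doc.lower().split() if w not in stoplist])
print(lda.get_document_topics(new_bow))  # [(topic_id, probability), ...]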