# Snippet title (truncated in the original listing): "from sklearn.feature_extraction.text import TfidfVectorizer from sklea..."

# (Line-number gutter 1-21 from the original listing; it carried no content.)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups
# Topic extraction script: build a TF-IDF matrix from two 20 Newsgroups
# categories, factorize it with NMF, and print the top-weighted terms of
# each resulting component.

n_samples = 2000   # upper bound on the number of documents sliced from the dataset
n_features = 1000  # vocabulary cap passed to the vectorizer as max_features
n_topics = 2       # number of NMF components to fit
n_top_words = 7    # number of highest-weighted terms printed per topic

# Training split restricted to the two categories of interest.
dataset = fetch_20newsgroups(
    subset='train', categories=['sci.space', 'rec.sport.hockey'])

# max_df/min_df drop terms that appear in more than 95% of the documents or
# in fewer than 2 documents; English stop words are removed as well.
vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
tfidf = vectorizer.fit_transform(dataset.data[:n_samples])

# Fixed random_state so repeated runs use the same seed for the factorization.
nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:\t" % (topic_idx))
    # argsort() is ascending, so the reversed slice [:-n_top_words - 1:-1]
    # yields the indices of the n_top_words largest weights, largest first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join([feature_names[i] for i in top_indices]))