seed_gensim.py
""" Uses gensim to analyze the text of the responses to the main questions of the SEED Survey """
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import gensim
import pyLDAvis.gensim


def create_topic_model(seed_arguments, list_responses):
    """ Using LDA from gensim, create the topic model from the list of responses """
    topic_model_dictionary, texts_to_analyze = create_topic_model_dictionary(
        list_responses)
    # convert the tokenized documents into a document-term matrix (the corpus)
    topic_model_corpus = [
        topic_model_dictionary.doc2bow(text) for text in texts_to_analyze
    ]
    # generate the LDA model from the corpus and the topic_model_dictionary
    lda_model = gensim.models.ldamodel.LdaModel(
        topic_model_corpus,
        id2word=topic_model_dictionary,
        num_topics=seed_arguments.num_topics,
        passes=seed_arguments.num_passes,
        alpha=seed_arguments.alpha,
        eta=seed_arguments.eta)
    return lda_model, topic_model_corpus, topic_model_dictionary, texts_to_analyze
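

# A minimal usage sketch (not part of the original module): the attribute
# names on seed_arguments (num_topics, num_passes, alpha, eta) match the
# parameters read above; the responses and values are hypothetical.
#
#     from types import SimpleNamespace
#     args = SimpleNamespace(num_topics=2, num_passes=10, alpha="auto", eta="auto")
#     model, corpus, dictionary, texts = create_topic_model(
#         args, ["I enjoyed the labs", "Lectures moved too quickly"])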


def create_topic_model_dictionary(list_responses):
    """ Create a topic model dictionary from the list of responses """
    # create the objects needed to prepare the dictionary
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    # p_stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    texts_to_analyze = []
    # loop through the list of responses
    for response in list_responses:
        # clean and tokenize the document string
        raw = response.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words, purely numeric tokens, and single characters
        keep_tokens = [token for token in tokens if token not in en_stop]
        keep_tokens = [token for token in keep_tokens if not token.isnumeric()]
        keep_tokens = [token for token in keep_tokens if len(token) > 1]
        # lemmatize the tokens (stemming is available as a commented-out alternative)
        # stemmed_tokens = [p_stemmer.stem(token) for token in keep_tokens]
        lemmatized_tokens = [
            wordnet_lemmatizer.lemmatize(token) for token in keep_tokens
        ]
        # add the tokens to the list of texts to analyze
        texts_to_analyze.append(lemmatized_tokens)
    # turn the tokenized documents into an id <-> term dictionary
    topic_model_dictionary = corpora.Dictionary(texts_to_analyze)
    return topic_model_dictionary, texts_to_analyze
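

# A sketch of the cleaning pipeline above on hypothetical responses, assuming
# the stop_words package's default English list and the NLTK wordnet corpus:
#
#     >>> _, texts = create_topic_model_dictionary(
#     ...     ["The courses helped me", "Courses were helpful"])
#     >>> texts
#     [['course', 'helped'], ['course', 'helpful']]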


def show_topic_model_textually(seed_gensim_topic_model, seed_gensim_corpus,
                               texts_to_analyze, num_topics):
    """ Using only textual output, provide a basic display of the topic model """
    # display the learned alpha, a summary of the model, and the top topics
    print("alpha =", seed_gensim_topic_model.alpha)
    print(seed_gensim_topic_model)
    print(seed_gensim_topic_model.print_topics(num_topics))
    print()


def show_topic_model_visually(seed_gensim_topic_model, seed_gensim_corpus,
                              seed_gensim_dictionary):
    """ Using an interactive visualization, provide a display of the topic model """
    # prepare the pyLDAvis data and serve it in a local browser window
    vis = pyLDAvis.gensim.prepare(seed_gensim_topic_model, seed_gensim_corpus,
                                  seed_gensim_dictionary)
    pyLDAvis.show(vis)
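

# A minimal end-to-end sketch (hypothetical responses and parameter values,
# not part of the original module); requires the stop_words package and the
# NLTK wordnet corpus; show_topic_model_visually would additionally launch
# the interactive pyLDAvis view in a browser.
if __name__ == "__main__":
    from types import SimpleNamespace
    sample_arguments = SimpleNamespace(
        num_topics=2, num_passes=10, alpha="auto", eta="auto")
    sample_responses = [
        "I enjoyed the programming labs",
        "The labs ran longer than expected",
        "Lectures moved too quickly for me",
    ]
    model, corpus, dictionary, texts = create_topic_model(
        sample_arguments, sample_responses)
    show_topic_model_textually(model, corpus, texts,
                               sample_arguments.num_topics)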