Skip to content

Commit 5bb9282

Browse files
author
Jonathan Chang
committed
Improve performance substantially. Include support for weights
1 parent 71dc35a commit 5bb9282

File tree

1 file changed

+103
-45
lines changed

1 file changed

+103
-45
lines changed

src/TopicModels.jl

Lines changed: 103 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -1,42 +1,94 @@
11
module TopicModels
22

3-
typealias RaggedMatrix{T} Array{Array{Int64,1},1}
4-
typealias Corpus RaggedMatrix{Int64}
3+
import Base.length
4+
5+
typealias RaggedMatrix{T} Array{Array{T,1},1}
6+
7+
type Corpus
8+
documents::RaggedMatrix{Int64}
9+
weights::RaggedMatrix{Float64}
10+
11+
Corpus(documents::RaggedMatrix{Int64},
12+
weights::RaggedMatrix{Float64}) = begin
13+
return new(
14+
documents,
15+
weights
16+
)
17+
end
18+
19+
Corpus(documents::RaggedMatrix{Int64}) = begin
20+
weights = map(documents) do doc
21+
ones(Float64, length(doc))
22+
end
23+
return new(
24+
documents,
25+
weights
26+
)
27+
end
28+
end
529

630
type Model
7-
alphaPrior::Array{Float64,1}
31+
alphaPrior::Vector{Float64}
832
betaPrior::Float64
9-
topics::Array{Int64,2}
10-
topicSums::Array{Int64,1}
11-
documentSums::Array{Int64,2}
33+
topics::Array{Float64,2}
34+
topicSums::Vector{Float64}
35+
documentSums::Array{Float64,2}
1236
assignments::RaggedMatrix{Int64}
37+
frozen::Bool
38+
corpus::Corpus
1339

14-
Model(alphaPrior::Array{Float64,1},
40+
Model(alphaPrior::Vector{Float64},
1541
betaPrior::Float64,
1642
V::Int64,
1743
corpus::Corpus) = begin
1844
K = length(alphaPrior)
1945
m = new(
2046
alphaPrior,
2147
betaPrior,
22-
zeros(Int64, K, V), # topics
23-
zeros(Int64, K), # topicSums
24-
zeros(Int64, K, length(corpus)), #documentSums
25-
fill(Array(Int64, 0), length(corpus)) # assignments
48+
zeros(Float64, K, V), # topics
49+
zeros(Float64, K), # topicSums
50+
zeros(Float64, K, length(corpus.documents)), #documentSums
51+
fill(Array(Int64, 0), length(corpus.documents)), # assignments
52+
false,
53+
corpus
2654
)
27-
for dd in 1:length(corpus)
28-
m.assignments[dd] = fill(0, length(corpus[dd]))
29-
for ww in 1:length(corpus[dd])
30-
word = corpus[dd][ww]
31-
topic = sampleMultinomial(alphaPrior)
32-
m.assignments[dd][ww] = topic
33-
updateSufficientStatistics(word, topic, dd, 1, m)
34-
end
35-
end
55+
initializeAssignments(m)
56+
return m
57+
end
58+
59+
Model(trainedModel::Model, corpus::Corpus) = begin
60+
m = new(
61+
trainedModel.alphaPrior,
62+
trainedModel.betaPrior,
63+
trainedModel.topics,
64+
trainedModel.topicSums,
65+
trainedModel.documentSums,
66+
fill(Array(Int64, 0), length(corpus.documents)),
67+
true,
68+
corpus
69+
)
70+
initializeAssignments(m)
3671
return m
3772
end
3873
end
3974

75+
function length(corpus::Corpus)
76+
return length(corpus.documents)
77+
end
78+
79+
function initializeAssignments(model::Model)
80+
for dd in 1:length(model.corpus)
81+
model.assignments[dd] = fill(0, length(model.corpus.documents[dd]))
82+
for ww in 1:length(model.corpus.documents[dd])
83+
word = model.corpus.documents[dd][ww]
84+
topic = sampleMultinomial(model.alphaPrior)
85+
model.assignments[dd][ww] = topic
86+
updateSufficientStatistics(
87+
word, topic, dd, model.corpus.weights[dd][ww], model)
88+
end
89+
end
90+
end
91+
4092
function sampleMultinomial(p::Array{Float64,1})
4193
pSum = sum(p)
4294
r = rand() * pSum
@@ -53,49 +105,56 @@ end
53105

54106
function wordDistribution(word::Int,
55107
document::Int,
56-
model::Model)
108+
model::Model,
109+
out::Vector{Float64})
57110
V = size(model.topics, 2)
58-
(model.documentSums[1:end,document] + model.alphaPrior) .*
59-
(model.topics[1:end, word] + model.betaPrior) ./
60-
(model.topicSums + V * model.betaPrior)
111+
for ii in 1:length(out)
112+
out[ii] = (model.documentSums[ii, document] + model.alphaPrior[ii]) *
113+
(model.topics[ii, word] + model.betaPrior) /
114+
(model.topicSums[ii] + V * model.betaPrior)
115+
end
116+
return out
61117
end
62118

63119
function sampleWord(word::Int,
64120
document::Int,
65-
model::Model)
66-
p = wordDistribution(word, document, model)
121+
model::Model,
122+
p::Vector{Float64})
123+
wordDistribution(word, document, model, p)
67124
sampleMultinomial(p)
68125
end
69126

70127

71-
function updateSufficientStatistics(word::Int,
72-
topic::Int,
73-
document::Int,
74-
scale::Int,
128+
function updateSufficientStatistics(word::Int64,
129+
topic::Int64,
130+
document::Int64,
131+
scale::Float64,
75132
model::Model)
76-
model.topics[topic, word] += scale
77-
model.topicSums[topic] += scale
78133
model.documentSums[topic, document] += scale
134+
model.topicSums[topic] += scale * !model.frozen
135+
model.topics[topic, word] += scale * !model.frozen
79136
end
80137

81-
function sampleDocument(words::Array{Int64,1},
82-
document::Int,
83-
model::Model)
138+
function sampleDocument(document::Int,
139+
model::Model)
140+
words = model.corpus.documents[document]
84141
Nw = length(words)
142+
weights = model.corpus.weights[document]
143+
K = length(model.alphaPrior)
144+
p = Array(Float64, K)
85145
for ii in 1:Nw
86146
word = words[ii]
87147
oldTopic = model.assignments[document][ii]
88-
updateSufficientStatistics(word, oldTopic, document, -1, model)
89-
newTopic = sampleWord(word, document, model)
148+
updateSufficientStatistics(word, oldTopic, document, -weights[ii], model)
149+
newTopic::Int64 = sampleWord(word, document, model, p)
90150
model.assignments[document][ii] = newTopic
91-
updateSufficientStatistics(word, newTopic, document, 1, model)
151+
updateSufficientStatistics(word, newTopic, document, weights[ii], model)
92152
end
93153
end
94154

95-
function sampleCorpus(corpus::Corpus,
96-
model::Model)
97-
for ii in 1:length(corpus)
98-
sampleDocument(corpus[ii], ii, model)
155+
function sampleCorpus(model::Model)
156+
for ii in 1:length(model.corpus)
157+
sampleDocument(ii, model)
99158
end
100159
end
101160

@@ -106,12 +165,11 @@ function termToWordSequence(term::String)
106165
end
107166

108167
# The functions below are designed for public consumption
109-
function trainModel(corpus::Corpus,
110-
model::Model,
168+
function trainModel(model::Model,
111169
numIterations::Int64)
112170
for ii in 1:numIterations
113171
println(string("Iteration ", ii, "..."))
114-
sampleCorpus(corpus, model)
172+
sampleCorpus(model)
115173
end
116174
end
117175

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy