diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/Manifest.toml b/Manifest.toml index c704ecc..cf7317f 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,136 +1,5 @@ # This file is machine-generated - editing it directly is not advised -[[Arpack]] -deps = ["Arpack_jll", "Libdl", "LinearAlgebra"] -git-tree-sha1 = "2ff92b71ba1747c5fdd541f8fc87736d82f40ec9" -uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" -version = "0.4.0" - -[[Arpack_jll]] -deps = ["Libdl", "OpenBLAS_jll", "Pkg"] -git-tree-sha1 = "68a90a692ddc0eb72d69a6993ca26e2a923bf195" -uuid = "68821587-b530-5797-8361-c406ea357684" -version = "3.5.0+2" - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[BinaryProvider]] -deps = ["Libdl", "SHA"] -git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.8" - -[[DataAPI]] -git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.1.0" - -[[DataStructures]] -deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.10" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[Distributions]] -deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] -git-tree-sha1 = "6b19601c0e98de3a8964ed33ad73e130c7165b1d" -uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" -version = "0.22.4" - -[[FillArrays]] -deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" -uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.5" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[LibGit2]] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[LinearAlgebra]] -deps = ["Libdl"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[Missings]] -deps = ["DataAPI"] -git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" -uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "0.4.3" - -[[OpenBLAS_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "e2551d7c25d52f35b76d86a50917a3ba8988f519" -uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.7+5" - -[[OpenSpecFun_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+1" - -[[OrderedCollections]] -deps = ["Random", "Serialization", "Test"] -git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.1.0" - -[[PDMats]] -deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] -git-tree-sha1 = "5f303510529486bb02ac4d70da8295da38302194" -uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" -version = "0.9.11" - -[[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[Printf]] -deps = ["Unicode"] -uuid = 
"de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[QuadGK]] -deps = ["DataStructures", "LinearAlgebra"] -git-tree-sha1 = "dc84e810393cfc6294248c9032a9cdacc14a3db4" -uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" -version = "2.3.1" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[Rmath]] -deps = ["BinaryProvider", "Libdl", "Random", "Statistics"] -git-tree-sha1 = "2bbddcb984a1d08612d0c4abb5b4774883f6fa98" -uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.6.0" - [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -160,12 +29,6 @@ version = "0.10.0" deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -[[StatsBase]] -deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.1" - [[StatsFuns]] deps = ["Rmath", "SpecialFunctions"] git-tree-sha1 = "f290ddd5fdedeadd10e961eb3f4d3340f09d030a" @@ -186,3 +49,39 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e" +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+8" + +[[libass_jll]] +deps = ["Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "3fd3ea3525f2e3d337c54a52b2ca78a5a272bbf5" +uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0" +version = "0.14.0+0" + +[[libfdk_aac_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "0e4ace600c20714a8dd67700c4502714d8473e8e" +uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280" +version = "0.1.6+1" + +[[libvorbis_jll]] +deps = ["Libdl", "Ogg_jll", "Pkg"] +git-tree-sha1 = "71e54fb89ac3e0344c7185d1876fd96b0f246952" +uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" +version = "1.3.6+2" + +[[x264_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "23664c0757c3740050ca0e22944c786c165ca25a" +uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a" +version = "2019.5.25+1" + +[[x265_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "9345e417084421a8e91373d6196bc58e660eed2a" +uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76" +version = "3.0.0+0" diff --git a/Project.toml b/Project.toml index 65ba1d2..a3f7a69 100644 --- a/Project.toml +++ b/Project.toml @@ -4,9 +4,12 @@ authors = ["Jonathan Chang "] version = "0.1.0" [deps] +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +UMAP = "c4f8c510-2410-5be4-91d7-4fbaeb39457e" [compat] julia = "1.3" diff --git a/examples/LDA.jl b/examples/LDA.jl index 16f7183..32027fc 100644 --- a/examples/LDA.jl +++ b/examples/LDA.jl @@ -1,14 +1,46 @@ -using TopicModels +using TopicModels, Plots, UMAP -exdir = Pkg.dir("TopicModels", "examples") +################################################################################################################################## +# Fit and Visualize Real-World Text Data -testDocuments = readDocuments(open(joinpath(exdir, "cora.documents"))) +exdir = joinpath(dirname(pathof(TopicModels)), "..", "examples") + +testDocuments = readDocs(open(joinpath(exdir, "cora.documents"))) 
 testLexicon = readLexicon(open(joinpath(exdir, "cora.lexicon")))
-corpus = Corpus(testDocuments)
+corpus = Corpus(testDocuments,testLexicon)
+model = Model(fill(0.1, 10), fill(0.01,length(testLexicon)), corpus)
+state = State(model,corpus)
+
+#@time Juno.@run trainModel(model, state, 30)
+@time trainModel(model, state, 30)
+topWords = topTopicWords(model, state, 10)
+
+# visualize the fit
+@time embedding = umap(state.topics, 2, n_neighbors=10)
+maxlabels = vec(map(i->i[1], findmax(state.topics,dims=1)[2]))
+scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: Max-Likelihood Doc Topics on Learned", marker=(2, 2, :auto, stroke(0)))
+
+##################################################################################################################################
+# Fit, Validate, and Visualize Synthetic Data Derived from a Fully-Generative Simulation (Poisson-distributed document length)
+
+k = 10
+lexLength = 1000
+corpLambda = 1000 # Poisson parameter for random doc length
+corpLength = 100
+scaleK = 0.01
+scaleL = 0.01
+testCorpus = LdaCorpus(k, lexLength, corpLambda, corpLength, scaleK, scaleL)
-model = Model(fill(0.1, 10), 0.01, length(testLexicon), corpus)
+testModel = Model(testCorpus.alpha, testCorpus.beta, testCorpus)
+testState = State(testModel, testCorpus)
+@time trainModel(testModel, testState, 100)
-@time trainModel(model, 30)
+# compute validation metrics on a single fit
+CorpusARI(testState,testModel,testCorpus) # ARI for max-likelihood document topics
+DocsARI(testState,testCorpus)             # ARI for actual word topics
-topWords = topTopicWords(model, testLexicon, 21)
+# visualize the fit
+@time embedding = umap(testState.topics, 2; n_neighbors=10)
+maxlabels = vec(map(i->i[1], findmax(CorpusTopics(testCorpus),dims=1)[2]))
+scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: True on Learned", marker=(2, 2, :auto, stroke(0)))
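For reference, readDocs/readLexicon above expect the LDA-C style sparse files shipped in examples/: each line of cora.documents is "N term:count term:count ...", the leading count field is skipped by readDocs, term ids are zero-based indices into cora.lexicon (one token per line), and every term:count pair is expanded into count repetitions of the 1-indexed term. For example, a hypothetical line

    2 0:1 5:2

is read as the word sequence [1, 6, 6].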
diff --git a/src/Computation.jl b/src/Computation.jl
new file mode 100644
index 0000000..13c8266
--- /dev/null
+++ b/src/Computation.jl
@@ -0,0 +1,251 @@
+struct Model
+    alphaPrior::Vector{Float64} # concentration parameter for the symmetric Dirichlet prior on document topics
+    betaPrior::Vector{Float64}  # concentration parameter for the symmetric Dirichlet prior on words
+    corpus::AbstractCorpus
+
+    # initialize an untrained model
+    Model(alphaPrior::Vector{Float64},
+          betaPrior::Vector{Float64},
+          corpus::AbstractCorpus) = begin
+        K = length(alphaPrior)
+        m = new(
+            alphaPrior,
+            betaPrior,
+            corpus)
+        return m
+    end
+
+    # initialize a trained model
+    Model(trainedModel::Model,
+          corpus::AbstractCorpus) = begin
+        m = new(
+            trainedModel.alphaPrior,
+            trainedModel.betaPrior,
+            corpus
+        )
+        return m
+    end
+end
+
+struct State
+    topics::Array{Float64,2}
+    topicSums::Vector{Float64}
+    docSums::Array{Float64,2}
+    assignments::Array{Array{Int64,1},1}
+    conditionals::Array{Array{Float64,2},1} # the p parameter for the word assignment (cat/multinom) variable
+    frozen::Bool
+
+    # randomly initialize the state
+    State(model::Model,
+          corpus::AbstractCorpus) = begin
+        K = length(model.alphaPrior)
+        s = new(
+            zeros(Float64, K, length(corpus.lexicon)),               # topics
+            zeros(Float64, K),                                       # topicSums
+            zeros(Float64, K, length(corpus.docs)),                  # docSums
+            fill(Array{Int64,1}(undef,0), length(corpus.docs)),      # assignments
+            fill(Array{Float64,2}(undef,0,K), length(corpus.docs)),  # conditionals
+            false
+        )
+        initializeAssignments(model,s,corpus)
+        return s
+    end
+
+    # initialize the state from a trained model
+    State(topics::Array{Float64,2},
+          topicSums::Vector{Float64},
+          docSums::Array{Float64,2},
+          assignments::Array{Array{Int64,1},1},
+          conditionals::Array{Array{Float64,2},1},
+          frozen::Bool) = begin
+        s = new(
+            topics,
+            topicSums,
+            docSums,
+            assignments,
+            conditionals,
+            frozen
+        )
+        return s
+    end
+end
+
+function AllTopics(state::State)
+    alltopics = []
+    for i in 1:length(state.assignments)
+        append!(alltopics,state.assignments[i])
+    end
+    return convert(Array{Int,1},alltopics)
+end
+
+function initializeAssignments(model::Model,state::State,corpus::AbstractCorpus)
+    for dd in 1:length(corpus)
+        @inbounds words = corpus.docs[dd].terms
+        @inbounds state.assignments[dd] = zeros(length(words))
+        @inbounds state.conditionals[dd] = zeros(length(words), length(model.alphaPrior))
+        for ww in 1:length(words)
+            @inbounds word = words[ww]
+            @inbounds state.conditionals[dd][ww,:] = model.alphaPrior
+            topic = sampleMultinomial(ww,dd,state)
+            @inbounds state.assignments[dd][ww] = topic
+            updateSufficientStatistics(word, topic, dd,
+                                       model.corpus.weights[dd][ww],
+                                       state)
+        end
+    end
+    return
+end
+
+
+function sampleMultinomial(word_ind::Int64,
+                           document::Int64,
+                           state::State)
+    cond = state.conditionals[document][word_ind,:]
+    pSum = sum(cond)
+    r = rand() * pSum
+    K = length(cond)
+    for k in 1:K
+        if r < cond[k]
+            return k
+        else
+            @inbounds r -= cond[k]
+        end
+    end
+    return 0
+end
+
+function cond_word(word::Int,
+                   word_ind::Int,
+                   document::Int,
+                   model::Model,
+                   state::State)
+    V = size(state.topics, 2)
+    for ii in 1:length(model.alphaPrior)
+        @inbounds state.conditionals[document][word_ind,ii] =
+            (state.docSums[ii, document] + model.alphaPrior[ii]) *
+            (state.topics[ii, word] + model.betaPrior[word]) /
+            (state.topicSums[ii] + V * model.betaPrior[word])
+    end
+    return
+end
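+# cond_word fills state.conditionals[document][word_ind, :] with the unnormalized
+# collapsed-Gibbs conditional for a single token,
+#     p(z_{d,i} = k | z_{-(d,i)}, w) ∝ (docSums[k,d] + α_k) * (topics[k,word] + β_word) / (topicSums[k] + V * β_word),
+# which sampleMultinomial then normalizes and samples from.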
+
+function log_beta(x::Vector{Float64})
+    # compute the natural log of the multivariate beta function
+    lb = sum(loggamma.(x))
+    lb -= loggamma(sum(x))
+end
+
+function joint_log_p(model::Model,
+                     state::State)
+    # calculate the full joint log likelihood; this is useful for testing
+    log_pz = 0
+    for k in 1:length(model.alphaPrior)
+        @inbounds log_pz += (log_beta(state.topics[k,:] .+ model.betaPrior) -
+                             log_beta(model.betaPrior))
+    end
+    for d in 1:length(model.corpus)
+        @inbounds log_pz += (log_beta(state.docSums[:,d] .+ model.alphaPrior) -
+                             log_beta(model.alphaPrior))
+    end
+    return log_pz
+end
+
+function sampleWord(word::Int,
+                    word_ind::Int,
+                    document::Int,
+                    model::Model,
+                    state::State)
+    cond_word(word, word_ind, document, model, state)
+    sampleMultinomial(word_ind, document, state)
+end
+
+
+function updateSufficientStatistics(word::Int64,
+                                    topic::Int64,
+                                    document::Int64,
+                                    scale::Float64,
+                                    state::State)
+    fr = Float64(!state.frozen)
+    @inbounds state.docSums[topic, document] += scale
+    @inbounds state.topicSums[topic] += scale * fr
+    @inbounds state.topics[topic, word] += scale * fr
+    return
+end
+
+@doc raw"""
+    getTermDist(state::State, model::Model)
+
+Compute ``\phi_{k,v} = \frac{\Psi_{k,v} + \beta_v}{\left( \sum^V_{v'=1} \Psi_{k,v'} + \beta_{v'}\right)}``
+
+Where ``\vec{\phi_k}`` parameterizes the V-dimensional categorical distribution of a word given topic ``k``.
+
+Returns the ``K \times V`` matrix ``\Phi`` (one row per topic).
+"""
+function getTermDist(state::State, model::Model)
+    Phi = Array{Float64,2}(undef,length(model.alphaPrior),length(model.betaPrior))
+    for topic in 1:length(model.alphaPrior)
+        Phi[topic,:] = (state.topics[topic,:] .+ model.betaPrior) ./ (state.topicSums[topic] + sum(model.betaPrior))
+    end
+    return Phi
+end
+
+@doc raw"""
+    getTopicDist(state::State, model::Model)
+
+Compute ``\theta_{k,m} = \frac{\Omega_{k,m} + \alpha_k}{\left( \sum^K_{k'=1} \Omega_{k',m} + \alpha_{k'}\right)}``
+
+Where ``\vec{\theta_m}`` parameterizes the K-dimensional categorical distribution of a document.
+
+Returns the ``K \times M`` matrix ``\Theta`` (one column per document).
+"""
+function getTopicDist(state::State, model::Model)
+    Theta = Array{Float64,2}(undef,length(model.alphaPrior),length(model.corpus))
+    for doc in 1:length(model.corpus)
+        Theta[:,doc] = (state.docSums[:,doc] .+ model.alphaPrior) ./ (sum(state.docSums[:,doc]) + sum(model.alphaPrior))
+    end
+    return Theta
+end
+
+function sampleDocument(document::Int,
+                        model::Model,
+                        state::State)
+    words = model.corpus.docs[document].terms
+    Nw = length(words)
+    @inbounds weights = model.corpus.weights[document]
+    K = length(model.alphaPrior)
+    @inbounds assignments = state.assignments[document]
+    for ii in 1:Nw
+        word = words[ii]
+        oldTopic = assignments[ii]
+        updateSufficientStatistics(word, oldTopic, document, -weights[ii], state)
+        newTopic = sampleWord(word, ii, document, model, state)
+        @inbounds assignments[ii] = newTopic
+        updateSufficientStatistics(word, newTopic, document, weights[ii], state)
+    end
+    return
+end
+
+function sampleCorpus(model::Model, state::State)
+    for ii in 1:length(model.corpus)
+        sampleDocument(ii, model, state)
+    end
+    return
+end
+
+# The functions below are designed for public consumption
+function trainModel(model::Model,
+                    state::State,
+                    numIterations::Int64)
+    for ii in 1:numIterations
+        println(string("Iteration ", ii, "..."))
+        sampleCorpus(model, state)
+    end
+    return
+end
+
+function topTopicWords(model::Model,
+                       state::State,
+                       numWords::Int64)
+    [model.corpus.lexicon[reverse(sortperm(state.topics'[1:end, row]))[1:numWords]]
+     for row in 1:size(state.topics,1)]
+end
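A minimal usage sketch for the two accessors above, assuming model and state have already been through trainModel as in examples/LDA.jl:

    Phi   = getTermDist(state, model)           # K x V; each row Phi[k, :] sums to 1
    Theta = getTopicDist(state, model)          # K x D; each column Theta[:, d] sums to 1
    topWords = topTopicWords(model, state, 10)  # ten top-weighted tokens per topic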
diff --git a/src/Data.jl b/src/Data.jl
new file mode 100644
index 0000000..14c8794
--- /dev/null
+++ b/src/Data.jl
@@ -0,0 +1,163 @@
+### Document
+abstract type AbstractDocument end
+
+struct LdaDocument <: AbstractDocument
+    # this is fully observed data from the LDA model
+    theta::Array{Float64,1} # the topic probs for the doc
+    z::Array{Int64,1}       # the topic for each word
+    terms::Array{Int64,1}   # the word tokens
+
+    LdaDocument(alpha::Array{Float64,1},
+                P::Array{Float64,2},
+                N::Int64) = begin # N is the length of the doc
+        d = new(
+            Array{Float64,1}(undef,size(P,2)),
+            Array{Int64,1}(undef,N),
+            Array{Int64,1}(undef,N)
+        )
+        GenerateDoc(d,alpha,P)
+        return d
+    end
+
+    LdaDocument(theta::Array{Float64,1},
+                z::Array{Int64,1},
+                terms::Array{Int64,1}) = begin
+        d = new(theta,z,terms)
+        return d
+    end
+end
+
+function GenerateDoc(doc::LdaDocument,
+                     alpha::Array{Float64,1},
+                     Phi::Array{Float64,2})
+    dd = Dirichlet(alpha)
+    doc.theta .= vec(rand(dd,1))
+    cat = Categorical(vec(doc.theta))
+    doc.z .= rand(cat,length(doc))
+    for i in 1:length(doc)
+        @inbounds dc = Categorical(Phi[:,doc.z[i]])
+        @inbounds doc.terms[i] = rand(dc,1)[1]
+    end
+    return
+end
+
+struct Document <: AbstractDocument
+    # this is actual data, where only the terms are observed
+    terms::Array{Int64,1} # the word tokens
+    Document(terms::Array{Int64,1}) = new(terms)
+end
+
+function length(doc::AbstractDocument)
+    return size(doc.terms,1)
+end
+
+### Corpus
+abstract type AbstractCorpus end
+
+struct LdaCorpus <: AbstractCorpus
+    # this is fully observed data from the LDA model
+    docs::Array{LdaDocument,1}
+    alpha::Array{Float64,1}
+    beta::Array{Float64,1}
+    Phi::Array{Float64,2}
+    weights::Array{Array{Float64,1}} # only unweighted terms supported
+    lexicon::Array{String,1}
+
+    LdaCorpus(k::Int64,
+              lexLength::Int64,
+              corpLambda::Int64,
+              corpLength::Int64,
+              scaleK::Float64,
+              scaleL::Float64) = begin
+        w = Array{Array{Float64,1},1}(undef,corpLength)
+        lex = string.([1:1:lexLength;]) # there is no real lexicon for synthetic data, so use the term indices as tokens
+        a = fill(scaleK,k)              # scale parameter for the Dirichlet topic prior
+        b = fill(scaleL,lexLength)      # scale parameter for the Dirichlet token prior
+        dl = Poisson(corpLambda)
+        docLengths = rand(dl,corpLength) # the lengths of the docs in the corpus
+        db = Dirichlet(b)
+        P = rand(db,k) # the Dirichlet token prior, containing one lexLength vector for each k
+        d = Array{LdaDocument,1}(undef,corpLength)
+        for i in 1:corpLength
+            w[i] = ones(docLengths[i])
+            @inbounds d[i] = LdaDocument(a,P,docLengths[i])
+        end
+        return new(d, a, b, P, w, lex)
+    end
+
+    LdaCorpus(docs::Array{LdaDocument,1}, # the documents
+              alpha::Array{Float64,1},
+              beta::Array{Float64,1},
+              Phi::Array{Float64,2},
+              weights::Array{Array{Float64,1},1},
+              lexicon::Array{String,1}) = begin
+        c = new(docs,alpha,beta,Phi,weights,lexicon)
+        return c
+    end
+end
+
+function CorpusTopics(corpus::LdaCorpus)
+    cat(dims=2,map(i->vec(i.theta), corpus.docs)...) # get a 2d array of (document-wise) mixed membership for the corpus
+end
+
+function AllTopics(corpus::LdaCorpus)
+    alltopics = []
+    for i in 1:length(corpus)
+        append!(alltopics,corpus.docs[i].z)
+    end
+    return convert(Array{Int,1},alltopics)
+end
+
+struct Corpus <: AbstractCorpus
+    docs::Array{Document,1}
+    weights::Array{Array{Float64,1},1}
+    lexicon::Array{String,1}
+
+    Corpus(docs::Array{Document,1},
+           weights::Array{Array{Float64,1},1},
+           lexicon::Array{String,1}) = begin
+        return new(
+            docs,
+            weights,
+            lexicon
+        )
+    end
+
+    Corpus(docs::Array{Document,1},
+           lexicon::Array{String,1}) = begin
+        return new(
+            docs,
+            map(x -> ones(Float64,length(x)), docs), # no weights
+            lexicon
+        )
+    end
+end
+
+function length(corpus::AbstractCorpus)
+    return length(corpus.docs)
+end
+
+# Expand a term:count pair into a count-length sequence [term, term, ....]
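+# e.g. "4:3" expands to [5, 5, 5]: term ids in the data files are zero-indexed,
+# while the lexicon (and Julia arrays) are one-indexed.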
+function termToWordSequence(term::AbstractString) + parts = split(term, ":") + fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) +end + +function readDocs(stream) + corpus = readlines(stream) + docs = Array{Document,1}(undef,length(corpus)) + for i in 1:length(corpus) + @inbounds terms = split(corpus[i], " ")[2:end] + @inbounds docs[i] = Document(termToWordSequence(terms[1])) + for ii in 2:length(terms) + @inbounds append!(docs[i].terms, termToWordSequence(terms[ii])) + end + end + return docs +end + +function readLexicon(stream) + lines = readlines(stream) + chomped = map(chomp, convert(Array{AbstractString,1}, lines)) + convert(Array{String,1},chomped) # convert from substrings +end diff --git a/src/TopicModels.jl b/src/TopicModels.jl index 2990d26..3c6c9e9 100644 --- a/src/TopicModels.jl +++ b/src/TopicModels.jl @@ -1,210 +1,32 @@ module TopicModels +#Imports import Base.length -RaggedMatrix{T} = Array{Array{T,1},1} - -struct Corpus - documents::RaggedMatrix{Int64} - weights::RaggedMatrix{Float64} - - Corpus(documents::RaggedMatrix{Int64}, - weights::RaggedMatrix{Float64}) = begin - return new( - documents, - weights - ) - end - - Corpus(documents::RaggedMatrix{Int64}) = begin - weights = map(documents) do doc - ones(Float64, length(doc)) - end - return new( - documents, - weights - ) - end -end - -struct Model - alphaPrior::Vector{Float64} - betaPrior::Float64 - topics::Array{Float64,2} - topicSums::Vector{Float64} - documentSums::Array{Float64,2} - assignments::RaggedMatrix{Int64} - frozen::Bool - corpus::Corpus - - Model(alphaPrior::Vector{Float64}, - betaPrior::Float64, - V::Int64, - corpus::Corpus) = begin - K = length(alphaPrior) - m = new( - alphaPrior, - betaPrior, - zeros(Float64, K, V), # topics - zeros(Float64, K), # topicSums - zeros(Float64, K, length(corpus.documents)), #documentSums - Array{Array{Int64,1},1}(undef,length(corpus.documents)), # assignments - false, - corpus - ) - initializeAssignments(m) - return m - end - - Model(trainedModel::Model, corpus::Corpus) = begin - m = new( - trainedModel.alphaPrior, - trainedModel.betaPrior, - trainedModel.topics, - trainedModel.topicSums, - trainedModel.documentSums, - fill(Array(Int64, 0), length(corpus.documents)), - true, - corpus - ) - initializeAssignments(m) - return m - end -end - -function length(corpus::Corpus) - return length(corpus.documents) -end - -function initializeAssignments(model::Model) - for dd in 1:length(model.corpus) - @inbounds words = model.corpus.documents[dd] - @inbounds model.assignments[dd] = fill(0, length(words)) - for ww in 1:length(words) - @inbounds word = words[ww] - topic = sampleMultinomial(model.alphaPrior) - @inbounds model.assignments[dd][ww] = topic - updateSufficientStatistics( - word, topic, dd, model.corpus.weights[dd][ww], model) - end - end - return -end - -function sampleMultinomial(p::Array{Float64,1}) - pSum = sum(p) - r = rand() * pSum - K = length(p) - for k in 1:K - if r < p[k] - return k - else - r -= p[k] - end - end - return 0 -end - -function wordDistribution(word::Int, - document::Int, - model::Model, - out::Vector{Float64}) - V = size(model.topics, 2) - for ii in 1:length(out) - u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * - (model.topics[ii, word] + model.betaPrior) / - (model.topicSums[ii] + V * model.betaPrior) - @inbounds out[ii] = u - end - return -end - -function sampleWord(word::Int, - document::Int, - model::Model, - p::Vector{Float64}) - wordDistribution(word, document, model, p) - sampleMultinomial(p) -end - - -function 
updateSufficientStatistics(word::Int64, - topic::Int64, - document::Int64, - scale::Float64, - model::Model) - fr = Float64(!model.frozen) - @inbounds model.documentSums[topic, document] += scale - @inbounds model.topicSums[topic] += scale * fr - @inbounds model.topics[topic, word] += scale * fr - return -end - -function sampleDocument(document::Int, - model::Model) - @inbounds words = model.corpus.documents[document] - Nw = length(words) - @inbounds weights = model.corpus.weights[document] - K = length(model.alphaPrior) - p = Array{Float64,1}(undef,K) - @inbounds assignments = model.assignments[document] - for ii in 1:Nw - @inbounds word = words[ii] - @inbounds oldTopic = assignments[ii] - updateSufficientStatistics(word, oldTopic, document, -weights[ii], model) - newTopic = sampleWord(word, document, model, p) - @inbounds assignments[ii] = newTopic - updateSufficientStatistics(word, newTopic, document, weights[ii], model) - end - return -end - -function sampleCorpus(model::Model) - for ii in 1:length(model.corpus) - sampleDocument(ii, model) - end - return -end - -# Note, files are zero indexed, but we are 1-indexed. -function termToWordSequence(term::AbstractString) - parts = split(term, ":") - fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) -end - -# The functions below are designed for public consumption -function trainModel(model::Model, - numIterations::Int64) - for ii in 1:numIterations - println(string("Iteration ", ii, "...")) - sampleCorpus(model) - end - return -end - -function topTopicWords(model::Model, - lexicon::Array{String,1}, - numWords::Int64) - [lexicon[reverse(sortperm(model.topics'[1:end, row]))[1:numWords]] - for row in 1:size(model.topics,1)] -end - -function readDocuments(stream) - lines = readlines(stream) - convert(RaggedMatrix{Int64}, - [vcat([termToWordSequence(term) for term in split(line, " ")[2:end]]...) 
-      for line in lines])
-end
-
-function readLexicon(stream)
-  lines = readlines(stream)
-  convert(Array{String,1},map(chomp, convert(Array{AbstractString,1}, lines)))
-end
+using Random, Distributions, Plots, UMAP
+using SpecialFunctions: loggamma
+using Clustering: randindex
+#Exports
 export Corpus,
+    LdaCorpus,
     Model,
-    readDocuments,
+    State,
+    readDocs,
     readLexicon,
+    termToWordSequence,
     topTopicWords,
-    trainModel
-end
+    trainModel,
+    CorpusTopics,
+    CorpusARI,
+    DocsARI
+
+#Data that we make or find in real life:
+include("Data.jl")
+
+#Bayesian learning and inference:
+include("Computation.jl")
+
+#Stuff like perplexity and ARI:
+include("Validation.jl")
+end #module
\ No newline at end of file
diff --git a/src/Validation.jl b/src/Validation.jl
new file mode 100644
index 0000000..57710e5
--- /dev/null
+++ b/src/Validation.jl
@@ -0,0 +1,15 @@
+function CorpusARI(state::State,model::Model,corpus::LdaCorpus)
+    # for synthetic data, turn our mixed-membership document vectors into max-likelihood assignments
+    # and check the ARI between the ground truth and the state
+
+    learned_max_clust = map(i->i[1], findmax(getTopicDist(state,model),dims=1)[2])
+    true_max_clust = map(i->i[1], findmax(CorpusTopics(corpus),dims=1)[2])
+    randindex(learned_max_clust,true_max_clust)
+end
+
+function DocsARI(state::State,corpus::LdaCorpus)
+    # for synthetic data, find the topic ARI across [all terms in] all documents in the corpus
+    learned_clust = AllTopics(state)
+    true_clust = AllTopics(corpus)
+    randindex(learned_clust,true_clust)
+end
diff --git a/test/Gibbs_unit_tests.jl b/test/Gibbs_unit_tests.jl
new file mode 100644
index 0000000..fda7937
--- /dev/null
+++ b/test/Gibbs_unit_tests.jl
@@ -0,0 +1,54 @@
+using Test, TopicModels, Random
+using TopicModels: updateSufficientStatistics, joint_log_p # non-exported fns we need
+
+
+# use the equality of the likelihood ratios to test that the conditional distribution is consistent with the joint distribution
+@testset "LDA docs" begin
+    # generate some data from LDA where the doc length is Poisson
+    k = 7
+    lexLength = 10
+    corpLambda = 10 # Poisson parameter for random doc length
+    corpLength = 10
+    scaleK = 0.1
+    scaleL = 0.1
+    Random.seed!(123)
+
+    corpus = LdaCorpus(k, lexLength, corpLambda, corpLength, scaleK, scaleL)
+
+    model = Model(corpus.alpha, corpus.beta, corpus)
+    state = State(model, corpus)
+    trainModel(model, state, 10) # update all the state variables
+
+    # pick a random doc/word to iterate the sampler
+    doc_ind = rand(1:corpLength)
+    word_ind = rand(1:length(corpus.docs[doc_ind]))
+    word = corpus.docs[doc_ind].terms[word_ind]
+
+    conditional = state.conditionals[doc_ind][word_ind,:]
+    oldTopic = copy(state.assignments[doc_ind][word_ind]) # the original topic assignment
+
+    newTopic = rand(collect(1:k)[1:end .!= oldTopic],1) # a different topic
+    newTopic = Int64(newTopic[1])
+
+    # get the log probabilities of the original state
+    joint_Lw = copy(joint_log_p(model,state)) # log prob of the full joint under the original topic
+    cond_Lw = log(state.conditionals[doc_ind][word_ind,oldTopic]/sum(state.conditionals[doc_ind][word_ind,:])) # log conditional p(z=oldTopic|...)
+    cond_Lw_new = log(state.conditionals[doc_ind][word_ind,newTopic]/sum(state.conditionals[doc_ind][word_ind,:])) # log conditional p(z=newTopic|...)
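+    # For a single-site Gibbs update the z_{-i} factors cancel, so the log-ratio of the
+    # conditionals must equal the log-ratio of the joints:
+    #     log p(z_i = k' | z_{-i}, w) - log p(z_i = k | z_{-i}, w)
+    #       = log p(z_i = k', z_{-i}, w) - log p(z_i = k, z_{-i}, w)
+    # Below, the assignment is flipped to newTopic, the joint is recomputed, and the two
+    # differences are compared.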
+
+    updateSufficientStatistics(word, oldTopic, doc_ind, -model.corpus.weights[doc_ind][word_ind], state) # remove counts for the old topic
+    updateSufficientStatistics(word, newTopic, doc_ind, model.corpus.weights[doc_ind][word_ind], state)  # add counts for the new topic
+    joint_Lw_new = copy(joint_log_p(model,state)) # log prob of the full joint under the new topic
+
+    print("joint_Lw: ", joint_Lw, "\n")
+    print("cond_Lw: ", cond_Lw, "\n")
+
+    print("joint_Lw_new: ", joint_Lw_new, "\n")
+    print("cond_Lw_new: ", cond_Lw_new, "\n")
+
+    print("joint_LR: ", joint_Lw_new-joint_Lw, "\n")
+    print("cond_LR: ", cond_Lw_new-cond_Lw, "\n")
+    print("old Topic: ", oldTopic, "\n")
+    print("new Topic: ", newTopic, "\n")
+
+    @test isless(abs(joint_Lw_new-joint_Lw - cond_Lw_new+cond_Lw),1e-5)
+end
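Running the new test set through Pkg.test() also requires a test/runtests.jl that pulls the file in. That file is not part of this diff; a minimal sketch, assuming the standard Julia package test layout:

    using Test, TopicModels
    include("Gibbs_unit_tests.jl")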
