diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/Manifest.toml b/Manifest.toml index c704ecc..cf7317f 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -1,136 +1,5 @@ # This file is machine-generated - editing it directly is not advised -[[Arpack]] -deps = ["Arpack_jll", "Libdl", "LinearAlgebra"] -git-tree-sha1 = "2ff92b71ba1747c5fdd541f8fc87736d82f40ec9" -uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" -version = "0.4.0" - -[[Arpack_jll]] -deps = ["Libdl", "OpenBLAS_jll", "Pkg"] -git-tree-sha1 = "68a90a692ddc0eb72d69a6993ca26e2a923bf195" -uuid = "68821587-b530-5797-8361-c406ea357684" -version = "3.5.0+2" - -[[Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[BinaryProvider]] -deps = ["Libdl", "SHA"] -git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" -uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.8" - -[[DataAPI]] -git-tree-sha1 = "674b67f344687a88310213ddfa8a2b3c76cc4252" -uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" -version = "1.1.0" - -[[DataStructures]] -deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" -uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.10" - -[[Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[Distributed]] -deps = ["Random", "Serialization", "Sockets"] -uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" - -[[Distributions]] -deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsBase", "StatsFuns"] -git-tree-sha1 = "6b19601c0e98de3a8964ed33ad73e130c7165b1d" -uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" -version = "0.22.4" - -[[FillArrays]] -deps = ["LinearAlgebra", "Random", "SparseArrays"] -git-tree-sha1 = "85c6b57e2680fa28d5c8adc798967377646fbf66" -uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "0.8.5" - -[[InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[LibGit2]] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[LinearAlgebra]] -deps = ["Libdl"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[Missings]] -deps = ["DataAPI"] -git-tree-sha1 = "de0a5ce9e5289f27df672ffabef4d1e5861247d5" -uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -version = "0.4.3" - -[[OpenBLAS_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "e2551d7c25d52f35b76d86a50917a3ba8988f519" -uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.7+5" - -[[OpenSpecFun_jll]] -deps = ["Libdl", "Pkg"] -git-tree-sha1 = "65f672edebf3f4e613ddf37db9dcbd7a407e5e90" -uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" -version = "0.5.3+1" - -[[OrderedCollections]] -deps = ["Random", "Serialization", "Test"] -git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" -uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.1.0" - -[[PDMats]] -deps = ["Arpack", "LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] -git-tree-sha1 = "5f303510529486bb02ac4d70da8295da38302194" -uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" -version = "0.9.11" - -[[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" - -[[Printf]] -deps = ["Unicode"] -uuid = 
"de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[QuadGK]] -deps = ["DataStructures", "LinearAlgebra"] -git-tree-sha1 = "dc84e810393cfc6294248c9032a9cdacc14a3db4" -uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" -version = "2.3.1" - -[[REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[Random]] -deps = ["Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[Rmath]] -deps = ["BinaryProvider", "Libdl", "Random", "Statistics"] -git-tree-sha1 = "2bbddcb984a1d08612d0c4abb5b4774883f6fa98" -uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.6.0" - [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -160,12 +29,6 @@ version = "0.10.0" deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" -[[StatsBase]] -deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] -git-tree-sha1 = "be5c7d45daa449d12868f4466dbf5882242cf2d9" -uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.32.1" - [[StatsFuns]] deps = ["Rmath", "SpecialFunctions"] git-tree-sha1 = "f290ddd5fdedeadd10e961eb3f4d3340f09d030a" @@ -186,3 +49,39 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[Zlib_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "fd36a6739e256527287c5444960d0266712cd49e" +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+8" + +[[libass_jll]] +deps = ["Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "3fd3ea3525f2e3d337c54a52b2ca78a5a272bbf5" +uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0" +version = "0.14.0+0" + +[[libfdk_aac_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "0e4ace600c20714a8dd67700c4502714d8473e8e" +uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280" +version = "0.1.6+1" + +[[libvorbis_jll]] +deps = ["Libdl", "Ogg_jll", "Pkg"] +git-tree-sha1 = "71e54fb89ac3e0344c7185d1876fd96b0f246952" +uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" +version = "1.3.6+2" + +[[x264_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "23664c0757c3740050ca0e22944c786c165ca25a" +uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a" +version = "2019.5.25+1" + +[[x265_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "9345e417084421a8e91373d6196bc58e660eed2a" +uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76" +version = "3.0.0+0" diff --git a/Project.toml b/Project.toml index 65ba1d2..a3f7a69 100644 --- a/Project.toml +++ b/Project.toml @@ -4,9 +4,12 @@ authors = ["Jonathan Chang "] version = "0.1.0" [deps] +Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +UMAP = "c4f8c510-2410-5be4-91d7-4fbaeb39457e" [compat] julia = "1.3" diff --git a/examples/LDA.jl b/examples/LDA.jl index 16f7183..32027fc 100644 --- a/examples/LDA.jl +++ b/examples/LDA.jl @@ -1,14 +1,46 @@ -using TopicModels +using TopicModels, Plots, UMAP -exdir = Pkg.dir("TopicModels", "examples") +################################################################################################################################## +# Fit and Visualize Real-World Text Data -testDocuments = readDocuments(open(joinpath(exdir, "cora.documents"))) +exdir = joinpath(dirname(pathof(TopicModels)), "..", "examples") + +testDocuments = readDocs(open(joinpath(exdir, "cora.documents"))) 
 testLexicon = readLexicon(open(joinpath(exdir, "cora.lexicon")))
-corpus = Corpus(testDocuments)
+corpus = Corpus(testDocuments,testLexicon)
+model = Model(fill(0.1, 10), fill(0.01,length(testLexicon)), corpus)
+state = State(model,corpus)
+
+#@time Juno.@run trainModel(model, state, 30)
+@time trainModel(model, state, 30)
+topWords = topTopicWords(model, state, 10)
+
+# visualize the fit
+@time embedding = umap(state.topics, 2, n_neighbors=10)
+maxlabels = vec(map(i->i[1], findmax(state.topics,dims=1)[2]))
+scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: Max-Likelihood Doc Topics on Learned", marker=(2, 2, :auto, stroke(0)))
+
+##################################################################################################################################
+# Fit, Validate, and Visualize Synthetic Data Derived from a Fully-Generative Simulation (Poisson-distributed document length)
+
+k = 10
+lexLength = 1000
+corpLambda = 1000 # Poisson parameter for random doc length
+corpLength = 100
+scaleK = 0.01
+scaleL = 0.01
+testCorpus = LdaCorpus(k, lexLength, corpLambda, corpLength, scaleK, scaleL)
-model = Model(fill(0.1, 10), 0.01, length(testLexicon), corpus)
+testModel = Model(testCorpus.alpha, testCorpus.beta, testCorpus)
+testState = State(testModel, testCorpus)
+@time trainModel(testModel, testState, 100)
-@time trainModel(model, 30)
+# compute validation metrics on a single fit
+CorpusARI(testState,testModel,testCorpus) # ARI for max-likelihood document topics
+DocsARI(testState,testCorpus)             # ARI for actual word topics
-topWords = topTopicWords(model, testLexicon, 21)
+# visualize the fit
+@time embedding = umap(testState.topics, 2; n_neighbors=10)
+maxlabels = vec(map(i->i[1], findmax(CorpusTopics(testCorpus),dims=1)[2]))
+scatter(embedding[1,:], embedding[2,:], zcolor=maxlabels, title="UMAP: True on Learned", marker=(2, 2, :auto, stroke(0)))
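For reference, readDocs/readLexicon above expect the LDA-C style sparse files shipped in examples/: each line of cora.documents is "N term:count term:count ...", the leading count field is skipped by readDocs, term ids are zero-based indices into cora.lexicon (one token per line), and every term:count pair is expanded into count repetitions of the 1-indexed term. For example, a hypothetical line

    2 0:1 5:2

is read as the word sequence [1, 6, 6].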
diff --git a/src/Computation.jl b/src/Computation.jl
new file mode 100644
index 0000000..13c8266
--- /dev/null
+++ b/src/Computation.jl
@@ -0,0 +1,251 @@
+struct Model
+    alphaPrior::Vector{Float64} # concentration parameter for the symmetric Dirichlet prior on document topics
+    betaPrior::Vector{Float64}  # concentration parameter for the symmetric Dirichlet prior on words
+    corpus::AbstractCorpus
+
+    # initialize an untrained model
+    Model(alphaPrior::Vector{Float64},
+          betaPrior::Vector{Float64},
+          corpus::AbstractCorpus) = begin
+        K = length(alphaPrior)
+        m = new(
+            alphaPrior,
+            betaPrior,
+            corpus)
+        return m
+    end
+
+    # initialize a trained model
+    Model(trainedModel::Model,
+          corpus::AbstractCorpus) = begin
+        m = new(
+            trainedModel.alphaPrior,
+            trainedModel.betaPrior,
+            corpus
+        )
+        return m
+    end
+end
+
+struct State
+    topics::Array{Float64,2}
+    topicSums::Vector{Float64}
+    docSums::Array{Float64,2}
+    assignments::Array{Array{Int64,1},1}
+    conditionals::Array{Array{Float64,2},1} # the p parameter for the word assignment (cat/multinom) variable
+    frozen::Bool
+
+    # randomly initialize the state
+    State(model::Model,
+          corpus::AbstractCorpus) = begin
+        K = length(model.alphaPrior)
+        s = new(
+            zeros(Float64, K, length(corpus.lexicon)),               # topics
+            zeros(Float64, K),                                       # topicSums
+            zeros(Float64, K, length(corpus.docs)),                  # docSums
+            fill(Array{Int64,1}(undef,0), length(corpus.docs)),      # assignments
+            fill(Array{Float64,2}(undef,0,K), length(corpus.docs)),  # conditionals
+            false
+        )
+        initializeAssignments(model,s,corpus)
+        return s
+    end
+
+    # initialize the state from a trained model
+    State(topics::Array{Float64,2},
+          topicSums::Vector{Float64},
+          docSums::Array{Float64,2},
+          assignments::Array{Array{Int64,1},1},
+          conditionals::Array{Array{Float64,2},1},
+          frozen::Bool) = begin
+        s = new(
+            topics,
+            topicSums,
+            docSums,
+            assignments,
+            conditionals,
+            frozen
+        )
+        return s
+    end
+end
+
+function AllTopics(state::State)
+    alltopics = []
+    for i in 1:length(state.assignments)
+        append!(alltopics,state.assignments[i])
+    end
+    return convert(Array{Int,1},alltopics)
+end
+
+function initializeAssignments(model::Model,state::State,corpus::AbstractCorpus)
+    for dd in 1:length(corpus)
+        @inbounds words = corpus.docs[dd].terms
+        @inbounds state.assignments[dd] = zeros(length(words))
+        @inbounds state.conditionals[dd] = zeros(length(words), length(model.alphaPrior))
+        for ww in 1:length(words)
+            @inbounds word = words[ww]
+            @inbounds state.conditionals[dd][ww,:] = model.alphaPrior
+            topic = sampleMultinomial(ww,dd,state)
+            @inbounds state.assignments[dd][ww] = topic
+            updateSufficientStatistics(word, topic, dd,
+                                       model.corpus.weights[dd][ww],
+                                       state)
+        end
+    end
+    return
+end
+
+
+function sampleMultinomial(word_ind::Int64,
+                           document::Int64,
+                           state::State)
+    cond = state.conditionals[document][word_ind,:]
+    pSum = sum(cond)
+    r = rand() * pSum
+    K = length(cond)
+    for k in 1:K
+        if r < cond[k]
+            return k
+        else
+            @inbounds r -= cond[k]
+        end
+    end
+    return 0
+end
+
+function cond_word(word::Int,
+                   word_ind::Int,
+                   document::Int,
+                   model::Model,
+                   state::State)
+    V = size(state.topics, 2)
+    for ii in 1:length(model.alphaPrior)
+        @inbounds state.conditionals[document][word_ind,ii] =
+            (state.docSums[ii, document] + model.alphaPrior[ii]) *
+            (state.topics[ii, word] + model.betaPrior[word]) /
+            (state.topicSums[ii] + V * model.betaPrior[word])
+    end
+    return
+end
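+# cond_word fills state.conditionals[document][word_ind, :] with the unnormalized
+# collapsed-Gibbs conditional for a single token,
+#     p(z_{d,i} = k | z_{-(d,i)}, w) ∝ (docSums[k,d] + α_k) * (topics[k,word] + β_word) / (topicSums[k] + V * β_word),
+# which sampleMultinomial then normalizes and samples from.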
+
+function log_beta(x::Vector{Float64})
+    # compute the natural log of the multivariate beta function
+    lb = sum(loggamma.(x))
+    lb -= loggamma(sum(x))
+end
+
+function joint_log_p(model::Model,
+                     state::State)
+    # calculate the full joint log likelihood; this is useful for testing
+    log_pz = 0
+    for k in 1:length(model.alphaPrior)
+        @inbounds log_pz += (log_beta(state.topics[k,:] .+ model.betaPrior) -
+                             log_beta(model.betaPrior))
+    end
+    for d in 1:length(model.corpus)
+        @inbounds log_pz += (log_beta(state.docSums[:,d] .+ model.alphaPrior) -
+                             log_beta(model.alphaPrior))
+    end
+    return log_pz
+end
+
+function sampleWord(word::Int,
+                    word_ind::Int,
+                    document::Int,
+                    model::Model,
+                    state::State)
+    cond_word(word, word_ind, document, model, state)
+    sampleMultinomial(word_ind, document, state)
+end
+
+
+function updateSufficientStatistics(word::Int64,
+                                    topic::Int64,
+                                    document::Int64,
+                                    scale::Float64,
+                                    state::State)
+    fr = Float64(!state.frozen)
+    @inbounds state.docSums[topic, document] += scale
+    @inbounds state.topicSums[topic] += scale * fr
+    @inbounds state.topics[topic, word] += scale * fr
+    return
+end
+
+@doc raw"""
+    getTermDist(state::State, model::Model)
+
+Compute ``\phi_{k,v} = \frac{\Psi_{k,v} + \beta_v}{\left( \sum^V_{v'=1} \Psi_{k,v'} + \beta_{v'}\right)}``
+
+Where ``\vec{\phi_k}`` parameterizes the V-dimensional categorical distribution of a word given topic ``k``.
+
+Returns the ``K \times V`` matrix ``\Phi`` (one row per topic).
+"""
+function getTermDist(state::State, model::Model)
+    Phi = Array{Float64,2}(undef,length(model.alphaPrior),length(model.betaPrior))
+    for topic in 1:length(model.alphaPrior)
+        Phi[topic,:] = (state.topics[topic,:] .+ model.betaPrior) ./ (state.topicSums[topic] + sum(model.betaPrior))
+    end
+    return Phi
+end
+
+@doc raw"""
+    getTopicDist(state::State, model::Model)
+
+Compute ``\theta_{k,m} = \frac{\Omega_{k,m} + \alpha_k}{\left( \sum^K_{k'=1} \Omega_{k',m} + \alpha_{k'}\right)}``
+
+Where ``\vec{\theta_m}`` parameterizes the K-dimensional categorical distribution of a document.
+
+Returns the ``K \times M`` matrix ``\Theta`` (one column per document).
+"""
+function getTopicDist(state::State, model::Model)
+    Theta = Array{Float64,2}(undef,length(model.alphaPrior),length(model.corpus))
+    for doc in 1:length(model.corpus)
+        Theta[:,doc] = (state.docSums[:,doc] .+ model.alphaPrior) ./ (sum(state.docSums[:,doc]) + sum(model.alphaPrior))
+    end
+    return Theta
+end
+
+function sampleDocument(document::Int,
+                        model::Model,
+                        state::State)
+    words = model.corpus.docs[document].terms
+    Nw = length(words)
+    @inbounds weights = model.corpus.weights[document]
+    K = length(model.alphaPrior)
+    @inbounds assignments = state.assignments[document]
+    for ii in 1:Nw
+        word = words[ii]
+        oldTopic = assignments[ii]
+        updateSufficientStatistics(word, oldTopic, document, -weights[ii], state)
+        newTopic = sampleWord(word, ii, document, model, state)
+        @inbounds assignments[ii] = newTopic
+        updateSufficientStatistics(word, newTopic, document, weights[ii], state)
+    end
+    return
+end
+
+function sampleCorpus(model::Model, state::State)
+    for ii in 1:length(model.corpus)
+        sampleDocument(ii, model, state)
+    end
+    return
+end
+
+# The functions below are designed for public consumption
+function trainModel(model::Model,
+                    state::State,
+                    numIterations::Int64)
+    for ii in 1:numIterations
+        println(string("Iteration ", ii, "..."))
+        sampleCorpus(model, state)
+    end
+    return
+end
+
+function topTopicWords(model::Model,
+                       state::State,
+                       numWords::Int64)
+    [model.corpus.lexicon[reverse(sortperm(state.topics'[1:end, row]))[1:numWords]]
+     for row in 1:size(state.topics,1)]
+end
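A minimal usage sketch for the two accessors above, assuming model and state have already been through trainModel as in examples/LDA.jl:

    Phi   = getTermDist(state, model)           # K x V; each row Phi[k, :] sums to 1
    Theta = getTopicDist(state, model)          # K x D; each column Theta[:, d] sums to 1
    topWords = topTopicWords(model, state, 10)  # ten top-weighted tokens per topic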
diff --git a/src/Data.jl b/src/Data.jl
new file mode 100644
index 0000000..14c8794
--- /dev/null
+++ b/src/Data.jl
@@ -0,0 +1,163 @@
+### Document
+abstract type AbstractDocument end
+
+struct LdaDocument <: AbstractDocument
+    # this is fully observed data from the LDA model
+    theta::Array{Float64,1} # the topic probs for the doc
+    z::Array{Int64,1}       # the topic for each word
+    terms::Array{Int64,1}   # the word tokens
+
+    LdaDocument(alpha::Array{Float64,1},
+                P::Array{Float64,2},
+                N::Int64) = begin # N is the length of the doc
+        d = new(
+            Array{Float64,1}(undef,size(P,2)),
+            Array{Int64,1}(undef,N),
+            Array{Int64,1}(undef,N)
+        )
+        GenerateDoc(d,alpha,P)
+        return d
+    end
+
+    LdaDocument(theta::Array{Float64,1},
+                z::Array{Int64,1},
+                terms::Array{Int64,1}) = begin
+        d = new(theta,z,terms)
+        return d
+    end
+end
+
+function GenerateDoc(doc::LdaDocument,
+                     alpha::Array{Float64,1},
+                     Phi::Array{Float64,2})
+    dd = Dirichlet(alpha)
+    doc.theta .= vec(rand(dd,1))
+    cat = Categorical(vec(doc.theta))
+    doc.z .= rand(cat,length(doc))
+    for i in 1:length(doc)
+        @inbounds dc = Categorical(Phi[:,doc.z[i]])
+        @inbounds doc.terms[i] = rand(dc,1)[1]
+    end
+    return
+end
+
+struct Document <: AbstractDocument
+    # this is actual data, where only the terms are observed
+    terms::Array{Int64,1} # the word tokens
+    Document(terms::Array{Int64,1}) = new(terms)
+end
+
+function length(doc::AbstractDocument)
+    return size(doc.terms,1)
+end
+
+### Corpus
+abstract type AbstractCorpus end
+
+struct LdaCorpus <: AbstractCorpus
+    # this is fully observed data from the LDA model
+    docs::Array{LdaDocument,1}
+    alpha::Array{Float64,1}
+    beta::Array{Float64,1}
+    Phi::Array{Float64,2}
+    weights::Array{Array{Float64,1}} # only unweighted terms supported
+    lexicon::Array{String,1}
+
+    LdaCorpus(k::Int64,
+              lexLength::Int64,
+              corpLambda::Int64,
+              corpLength::Int64,
+              scaleK::Float64,
+              scaleL::Float64) = begin
+        w = Array{Array{Float64,1},1}(undef,corpLength)
+        lex = string.([1:1:lexLength;]) # there is no real lexicon for synthetic data, so use the term indices as tokens
+        a = fill(scaleK,k)              # scale parameter for the Dirichlet topic prior
+        b = fill(scaleL,lexLength)      # scale parameter for the Dirichlet token prior
+        dl = Poisson(corpLambda)
+        docLengths = rand(dl,corpLength) # the lengths of the docs in the corpus
+        db = Dirichlet(b)
+        P = rand(db,k) # the Dirichlet token prior, containing one lexLength vector for each k
+        d = Array{LdaDocument,1}(undef,corpLength)
+        for i in 1:corpLength
+            w[i] = ones(docLengths[i])
+            @inbounds d[i] = LdaDocument(a,P,docLengths[i])
+        end
+        return new(d, a, b, P, w, lex)
+    end
+
+    LdaCorpus(docs::Array{LdaDocument,1}, # the documents
+              alpha::Array{Float64,1},
+              beta::Array{Float64,1},
+              Phi::Array{Float64,2},
+              weights::Array{Array{Float64,1},1},
+              lexicon::Array{String,1}) = begin
+        c = new(docs,alpha,beta,Phi,weights,lexicon)
+        return c
+    end
+end
+
+function CorpusTopics(corpus::LdaCorpus)
+    cat(dims=2,map(i->vec(i.theta), corpus.docs)...) # get a 2d array of (document-wise) mixed membership for the corpus
+end
+
+function AllTopics(corpus::LdaCorpus)
+    alltopics = []
+    for i in 1:length(corpus)
+        append!(alltopics,corpus.docs[i].z)
+    end
+    return convert(Array{Int,1},alltopics)
+end
+
+struct Corpus <: AbstractCorpus
+    docs::Array{Document,1}
+    weights::Array{Array{Float64,1},1}
+    lexicon::Array{String,1}
+
+    Corpus(docs::Array{Document,1},
+           weights::Array{Array{Float64,1},1},
+           lexicon::Array{String,1}) = begin
+        return new(
+            docs,
+            weights,
+            lexicon
+        )
+    end
+
+    Corpus(docs::Array{Document,1},
+           lexicon::Array{String,1}) = begin
+        return new(
+            docs,
+            map(x -> ones(Float64,length(x)), docs), # no weights
+            lexicon
+        )
+    end
+end
+
+function length(corpus::AbstractCorpus)
+    return length(corpus.docs)
+end
+
+# Expand a term:count pair into a count-length sequence [term, term, ....]
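+# e.g. "4:3" expands to [5, 5, 5]: term ids in the data files are zero-indexed,
+# while the lexicon (and Julia arrays) are one-indexed.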
+function termToWordSequence(term::AbstractString) + parts = split(term, ":") + fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) +end + +function readDocs(stream) + corpus = readlines(stream) + docs = Array{Document,1}(undef,length(corpus)) + for i in 1:length(corpus) + @inbounds terms = split(corpus[i], " ")[2:end] + @inbounds docs[i] = Document(termToWordSequence(terms[1])) + for ii in 2:length(terms) + @inbounds append!(docs[i].terms, termToWordSequence(terms[ii])) + end + end + return docs +end + +function readLexicon(stream) + lines = readlines(stream) + chomped = map(chomp, convert(Array{AbstractString,1}, lines)) + convert(Array{String,1},chomped) # convert from substrings +end diff --git a/src/TopicModels.jl b/src/TopicModels.jl index 2990d26..3c6c9e9 100644 --- a/src/TopicModels.jl +++ b/src/TopicModels.jl @@ -1,210 +1,32 @@ module TopicModels +#Imports import Base.length -RaggedMatrix{T} = Array{Array{T,1},1} - -struct Corpus - documents::RaggedMatrix{Int64} - weights::RaggedMatrix{Float64} - - Corpus(documents::RaggedMatrix{Int64}, - weights::RaggedMatrix{Float64}) = begin - return new( - documents, - weights - ) - end - - Corpus(documents::RaggedMatrix{Int64}) = begin - weights = map(documents) do doc - ones(Float64, length(doc)) - end - return new( - documents, - weights - ) - end -end - -struct Model - alphaPrior::Vector{Float64} - betaPrior::Float64 - topics::Array{Float64,2} - topicSums::Vector{Float64} - documentSums::Array{Float64,2} - assignments::RaggedMatrix{Int64} - frozen::Bool - corpus::Corpus - - Model(alphaPrior::Vector{Float64}, - betaPrior::Float64, - V::Int64, - corpus::Corpus) = begin - K = length(alphaPrior) - m = new( - alphaPrior, - betaPrior, - zeros(Float64, K, V), # topics - zeros(Float64, K), # topicSums - zeros(Float64, K, length(corpus.documents)), #documentSums - Array{Array{Int64,1},1}(undef,length(corpus.documents)), # assignments - false, - corpus - ) - initializeAssignments(m) - return m - end - - Model(trainedModel::Model, corpus::Corpus) = begin - m = new( - trainedModel.alphaPrior, - trainedModel.betaPrior, - trainedModel.topics, - trainedModel.topicSums, - trainedModel.documentSums, - fill(Array(Int64, 0), length(corpus.documents)), - true, - corpus - ) - initializeAssignments(m) - return m - end -end - -function length(corpus::Corpus) - return length(corpus.documents) -end - -function initializeAssignments(model::Model) - for dd in 1:length(model.corpus) - @inbounds words = model.corpus.documents[dd] - @inbounds model.assignments[dd] = fill(0, length(words)) - for ww in 1:length(words) - @inbounds word = words[ww] - topic = sampleMultinomial(model.alphaPrior) - @inbounds model.assignments[dd][ww] = topic - updateSufficientStatistics( - word, topic, dd, model.corpus.weights[dd][ww], model) - end - end - return -end - -function sampleMultinomial(p::Array{Float64,1}) - pSum = sum(p) - r = rand() * pSum - K = length(p) - for k in 1:K - if r < p[k] - return k - else - r -= p[k] - end - end - return 0 -end - -function wordDistribution(word::Int, - document::Int, - model::Model, - out::Vector{Float64}) - V = size(model.topics, 2) - for ii in 1:length(out) - u = (model.documentSums[ii, document] + model.alphaPrior[ii]) * - (model.topics[ii, word] + model.betaPrior) / - (model.topicSums[ii] + V * model.betaPrior) - @inbounds out[ii] = u - end - return -end - -function sampleWord(word::Int, - document::Int, - model::Model, - p::Vector{Float64}) - wordDistribution(word, document, model, p) - sampleMultinomial(p) -end - - -function 
updateSufficientStatistics(word::Int64, - topic::Int64, - document::Int64, - scale::Float64, - model::Model) - fr = Float64(!model.frozen) - @inbounds model.documentSums[topic, document] += scale - @inbounds model.topicSums[topic] += scale * fr - @inbounds model.topics[topic, word] += scale * fr - return -end - -function sampleDocument(document::Int, - model::Model) - @inbounds words = model.corpus.documents[document] - Nw = length(words) - @inbounds weights = model.corpus.weights[document] - K = length(model.alphaPrior) - p = Array{Float64,1}(undef,K) - @inbounds assignments = model.assignments[document] - for ii in 1:Nw - @inbounds word = words[ii] - @inbounds oldTopic = assignments[ii] - updateSufficientStatistics(word, oldTopic, document, -weights[ii], model) - newTopic = sampleWord(word, document, model, p) - @inbounds assignments[ii] = newTopic - updateSufficientStatistics(word, newTopic, document, weights[ii], model) - end - return -end - -function sampleCorpus(model::Model) - for ii in 1:length(model.corpus) - sampleDocument(ii, model) - end - return -end - -# Note, files are zero indexed, but we are 1-indexed. -function termToWordSequence(term::AbstractString) - parts = split(term, ":") - fill(parse(Int64, parts[1]) + 1, parse(Int64, parts[2])) -end - -# The functions below are designed for public consumption -function trainModel(model::Model, - numIterations::Int64) - for ii in 1:numIterations - println(string("Iteration ", ii, "...")) - sampleCorpus(model) - end - return -end - -function topTopicWords(model::Model, - lexicon::Array{String,1}, - numWords::Int64) - [lexicon[reverse(sortperm(model.topics'[1:end, row]))[1:numWords]] - for row in 1:size(model.topics,1)] -end - -function readDocuments(stream) - lines = readlines(stream) - convert(RaggedMatrix{Int64}, - [vcat([termToWordSequence(term) for term in split(line, " ")[2:end]]...) 
-      for line in lines])
-end
-
-function readLexicon(stream)
-  lines = readlines(stream)
-  convert(Array{String,1},map(chomp, convert(Array{AbstractString,1}, lines)))
-end
+using Random, Distributions, Plots, UMAP
+using SpecialFunctions: loggamma
+using Clustering: randindex
+#Exports
 export Corpus,
+    LdaCorpus,
     Model,
-    readDocuments,
+    State,
+    readDocs,
     readLexicon,
+    termToWordSequence,
     topTopicWords,
-    trainModel
-end
+    trainModel,
+    CorpusTopics,
+    CorpusARI,
+    DocsARI
+
+#Data that we make or find in real life:
+include("Data.jl")
+
+#Bayesian learning and inference:
+include("Computation.jl")
+
+#Stuff like perplexity and ARI:
+include("Validation.jl")
+end #module
\ No newline at end of file
diff --git a/src/Validation.jl b/src/Validation.jl
new file mode 100644
index 0000000..57710e5
--- /dev/null
+++ b/src/Validation.jl
@@ -0,0 +1,15 @@
+function CorpusARI(state::State,model::Model,corpus::LdaCorpus)
+    # for synthetic data, turn our mixed-membership document vectors into max-likelihood assignments
+    # and check the ARI between the ground truth and the state
+
+    learned_max_clust = map(i->i[1], findmax(getTopicDist(state,model),dims=1)[2])
+    true_max_clust = map(i->i[1], findmax(CorpusTopics(corpus),dims=1)[2])
+    randindex(learned_max_clust,true_max_clust)
+end
+
+function DocsARI(state::State,corpus::LdaCorpus)
+    # for synthetic data, find the topic ARI across [all terms in] all documents in the corpus
+    learned_clust = AllTopics(state)
+    true_clust = AllTopics(corpus)
+    randindex(learned_clust,true_clust)
+end
diff --git a/test/Gibbs_unit_tests.jl b/test/Gibbs_unit_tests.jl
new file mode 100644
index 0000000..fda7937
--- /dev/null
+++ b/test/Gibbs_unit_tests.jl
@@ -0,0 +1,54 @@
+using Test, TopicModels, Random
+using TopicModels: updateSufficientStatistics, joint_log_p # non-exported fns we need
+
+
+# use the equality of the likelihood ratios to test that the conditional distribution is consistent with the joint distribution
+@testset "LDA docs" begin
+    # generate some data from LDA where the doc length is Poisson
+    k = 7
+    lexLength = 10
+    corpLambda = 10 # Poisson parameter for random doc length
+    corpLength = 10
+    scaleK = 0.1
+    scaleL = 0.1
+    Random.seed!(123)
+
+    corpus = LdaCorpus(k, lexLength, corpLambda, corpLength, scaleK, scaleL)
+
+    model = Model(corpus.alpha, corpus.beta, corpus)
+    state = State(model, corpus)
+    trainModel(model, state, 10) # update all the state variables
+
+    # pick a random doc/word to iterate the sampler
+    doc_ind = rand(1:corpLength)
+    word_ind = rand(1:length(corpus.docs[doc_ind]))
+    word = corpus.docs[doc_ind].terms[word_ind]
+
+    conditional = state.conditionals[doc_ind][word_ind,:]
+    oldTopic = copy(state.assignments[doc_ind][word_ind]) # the original topic assignment
+
+    newTopic = rand(collect(1:k)[1:end .!= oldTopic],1) # a different topic
+    newTopic = Int64(newTopic[1])
+
+    # get the log probabilities of the original state
+    joint_Lw = copy(joint_log_p(model,state)) # log prob of the full joint under the original topic
+    cond_Lw = log(state.conditionals[doc_ind][word_ind,oldTopic]/sum(state.conditionals[doc_ind][word_ind,:])) # log conditional p(z=oldTopic|...)
+    cond_Lw_new = log(state.conditionals[doc_ind][word_ind,newTopic]/sum(state.conditionals[doc_ind][word_ind,:])) # log conditional p(z=newTopic|...)
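+    # For a single-site Gibbs update the z_{-i} factors cancel, so the log-ratio of the
+    # conditionals must equal the log-ratio of the joints:
+    #     log p(z_i = k' | z_{-i}, w) - log p(z_i = k | z_{-i}, w)
+    #       = log p(z_i = k', z_{-i}, w) - log p(z_i = k, z_{-i}, w)
+    # Below, the assignment is flipped to newTopic, the joint is recomputed, and the two
+    # differences are compared.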
+
+    updateSufficientStatistics(word, oldTopic, doc_ind, -model.corpus.weights[doc_ind][word_ind], state) # remove counts for the old topic
+    updateSufficientStatistics(word, newTopic, doc_ind, model.corpus.weights[doc_ind][word_ind], state)  # add counts for the new topic
+    joint_Lw_new = copy(joint_log_p(model,state)) # log prob of the full joint under the new topic
+
+    print("joint_Lw: ", joint_Lw, "\n")
+    print("cond_Lw: ", cond_Lw, "\n")
+
+    print("joint_Lw_new: ", joint_Lw_new, "\n")
+    print("cond_Lw_new: ", cond_Lw_new, "\n")
+
+    print("joint_LR: ", joint_Lw_new-joint_Lw, "\n")
+    print("cond_LR: ", cond_Lw_new-cond_Lw, "\n")
+    print("old Topic: ", oldTopic, "\n")
+    print("new Topic: ", newTopic, "\n")
+
+    @test isless(abs(joint_Lw_new-joint_Lw - cond_Lw_new+cond_Lw),1e-5)
+end
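Running the new test set through Pkg.test() also requires a test/runtests.jl that pulls the file in. That file is not part of this diff; a minimal sketch, assuming the standard Julia package test layout:

    using Test, TopicModels
    include("Gibbs_unit_tests.jl")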
