# Scans Wikipedia for pages using the CongBio and CongLinks
# templates, which carry Bioguide IDs. Updates the 'wikipedia'
# ID field for matching Members of Congress and, for pages
# using the CongLinks template, also updates a variety of
# other IDs found in the template.

import lxml.etree, re, urllib.request, urllib.parse, urllib.error
import utils, os.path
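
# Usage sketch (assuming, as elsewhere in this project, that utils.flags()
# turns a command-line flag like "--cache" into {'cache': True}):
#
#   python wikipedia_ids.py           # re-query Wikipedia for everything
#   python wikipedia_ids.py --cache   # reuse previously downloaded data where possible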

def run():
    # Field mapping. And which fields should be turned into integers.
    # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
    fieldmap = {
        "congbio": "bioguide",
        #"fec": "fec", # handled specially...
        "govtrack": "govtrack", # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
        "opensecrets": "opensecrets",
        "votesmart": "votesmart",
        "cspan": "cspan",
    }
    int_fields = ("govtrack", "votesmart", "cspan")
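
    # For illustration only (made-up IDs): a CongLinks call in page wikitext
    # looks roughly like
    #   {{CongLinks | congbio=A000000 | govtrack=123456 | votesmart=7890 | cspan=99}}
    # and fieldmap maps each template parameter name to the key used in the
    # legislators' "id" dictionaries below.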

    # default to not caching
    cache = utils.flags().get('cache', False)

    # Load legislator files and map bioguide IDs.
    y1 = utils.load_data("legislators-current.yaml")
    y2 = utils.load_data("legislators-historical.yaml")
    bioguides = { }
    for y in y1+y2:
        bioguides[y["id"]["bioguide"]] = y
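
    # Each bioguides value is the legislator's full record from y1/y2, so the
    # member["id"].update(...) calls below modify the loaded data in place and
    # utils.save_data at the end writes those changes back out.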

    # Okay now the Wikipedia stuff...

    def get_matching_pages():
        # Does a Wikipedia API search for pages containing either of the
        # two templates. Returns the pages.

        page_titles = set()

        for template in ("CongLinks", "CongBio"):
            eicontinue = ""
            while True:
                # construct query URL, using the "eicontinue" of the last query to get the next batch
                url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
                if eicontinue: url += "&eicontinue=" + eicontinue

                # load the XML
                print("Getting %s pages (%d...)" % (template, len(page_titles)))
                dom = lxml.etree.fromstring(utils.download(url, None, True)) # can't cache eicontinue probably
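
                # The embeddedin result is roughly (simplified, per the older
                # query-continue XML format this code expects):
                #   <api>
                #     <query><embeddedin><ei pageid="..." title="Some page"/>...</embeddedin></query>
                #     <query-continue><embeddedin eicontinue="..."/></query-continue>
                #   </api>
                # which is what the two XPath expressions below pull apart.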
                for pgname in dom.xpath("query/embeddedin/ei/@title"):
                    page_titles.add(pgname)

                # get the next eicontinue value and loop
                eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)")
                if not eicontinue: break

        return page_titles

    # Get the list of Wikipedia pages that use any of the templates we care about.
    page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles")
    if cache and os.path.exists(page_list_cache_file):
        # Load from cache.
        matching_pages = open(page_list_cache_file).read().split("\n")
    else:
        # Query Wikipedia API and save to cache.
        matching_pages = get_matching_pages()
        utils.write(("\n".join(matching_pages)), page_list_cache_file)

    # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon).
    matching_pages = [p for p in matching_pages if ":" not in p]

    # Load each page's content and parse the template.
    for p in sorted(matching_pages):
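        # These titles embed one of the templates but aren't individual member
        # biographies (campaign articles, surname pages, lists, and a couple of
        # one-off cases), so skip them.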
if " campaign" in p: continue
if " (surname)" in p: continue
if "career of " in p: continue
if "for Congress" in p: continue
if p.startswith("List of "): continue
if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"): continue

        # Query the Wikipedia API to get the raw page content in XML,
        # and then use XPath to get the raw page text.
        url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(p.encode("utf8")) + "&export&exportnowrap"
        cache_path = "legislators/wikipedia/pages/" + p

        dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache))
        page_content = dom.xpath("string(mw:page/mw:revision/mw:text)", namespaces={ "mw": "http://www.mediawiki.org/xml/export-0.8/" })
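
        # The export XML declares the MediaWiki export-0.8 namespace, which is
        # why the XPath above needs the explicit "mw" prefix; page_content now
        # holds the page's raw wikitext.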

        # Build a dict for the IDs that we want to insert into our files.
        new_ids = {
            "wikipedia": p # Wikipedia page name, with spaces for spaces (not underscores)
        }
if "CongLinks" in page_content:
# Parse the key/val pairs in the template.
m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
if not m: continue # no template?
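            # m.group(1) is everything between "CongLinks" and the closing
            # braces: a pipe-separated list of key=value template parameters.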
            for arg in m.group(1).split("|"):
                if "=" not in arg: continue
                key, val = arg.split("=", 1)
                key = key.strip()
                val = val.strip()
                if val and key in fieldmap:
                    try:
                        if fieldmap[key] in int_fields: val = int(val)
                    except ValueError:
                        print("invalid value", key, val)
                        continue

                    if key == "opensecrets": val = val.replace("&newMem=Y", "").replace("&newmem=Y", "").replace("&cycle=2004", "").upper()
                    new_ids[fieldmap[key]] = val

            if "bioguide" not in new_ids: continue
            new_ids["bioguide"] = new_ids["bioguide"].upper() # hmm
            bioguide = new_ids["bioguide"]

        else:
            m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
            if not m: continue # no template?
            bioguide = m.group(1).upper()

        if bioguide not in bioguides:
            print("Member not found: " + bioguide, p.encode("utf8"), "(Might have been a delegate to the Constitutional Convention.)")
            continue

        # handle FEC ids specially because they are stored in an array...
        fec_id = new_ids.get("fec")
        if fec_id: del new_ids["fec"]

        member = bioguides[bioguide]
        member["id"].update(new_ids)

        # ...finish the FEC id.
        if fec_id:
            if fec_id not in bioguides[bioguide]["id"].get("fec", []):
                bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)

        #print p.encode("utf8"), new_ids

    utils.save_data(y1, "legislators-current.yaml")
    utils.save_data(y2, "legislators-historical.yaml")

if __name__ == '__main__':
    run()