# Scans Wikipedia for pages using the CongBio and CongLinks
# templates, which carry Bioguide IDs. Updates the 'wikipedia'
# ID field for matching Members of Congress and, for pages
# using the CongLinks template, also updates a variety of
# other IDs found in the template.

import lxml.etree, re, urllib.request, urllib.parse, urllib.error
import utils, os.path
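
# Usage sketch (assuming, as elsewhere in this project, that utils.flags()
# turns a command-line flag like "--cache" into {'cache': True}):
#
#   python wikipedia_ids.py           # re-query Wikipedia for everything
#   python wikipedia_ids.py --cache   # reuse previously downloaded data where possible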

def run():
    # Field mapping. And which fields should be turned into integers.
    # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
    fieldmap = {
        "congbio": "bioguide",
        #"fec": "fec", # handled specially...
        "govtrack": "govtrack", # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
        "opensecrets": "opensecrets",
        "votesmart": "votesmart",
        "cspan": "cspan",
    }
    int_fields = ("govtrack", "votesmart", "cspan")
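
    # For illustration only (made-up IDs): a CongLinks call in page wikitext
    # looks roughly like
    #   {{CongLinks | congbio=A000000 | govtrack=123456 | votesmart=7890 | cspan=99}}
    # and fieldmap maps each template parameter name to the key used in the
    # legislators' "id" dictionaries below.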

    # default to not caching
    cache = utils.flags().get('cache', False)

    # Load legislator files and map bioguide IDs.
    y1 = utils.load_data("legislators-current.yaml")
    y2 = utils.load_data("legislators-historical.yaml")
    bioguides = { }
    for y in y1+y2:
        bioguides[y["id"]["bioguide"]] = y
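
    # Each bioguides value is the legislator's full record from y1/y2, so the
    # member["id"].update(...) calls below modify the loaded data in place and
    # utils.save_data at the end writes those changes back out.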

    # Okay now the Wikipedia stuff...

    def get_matching_pages():
        # Does a Wikipedia API search for pages containing either of the
        # two templates. Returns the pages.

        page_titles = set()

        for template in ("CongLinks", "CongBio"):
            eicontinue = ""
            while True:
                # construct query URL, using the "eicontinue" of the last query to get the next batch
                url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
                if eicontinue: url += "&eicontinue=" + eicontinue

                # load the XML
                print("Getting %s pages (%d...)" % (template, len(page_titles)))
                dom = lxml.etree.fromstring(utils.download(url, None, True)) # can't cache eicontinue probably
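
                # The embeddedin result is roughly (simplified, per the older
                # query-continue XML format this code expects):
                #   <api>
                #     <query><embeddedin><ei pageid="..." title="Some page"/>...</embeddedin></query>
                #     <query-continue><embeddedin eicontinue="..."/></query-continue>
                #   </api>
                # which is what the two XPath expressions below pull apart.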
                for pgname in dom.xpath("query/embeddedin/ei/@title"):
                    page_titles.add(pgname)

                # get the next eicontinue value and loop
                eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)")
                if not eicontinue: break

        return page_titles

    # Get the list of Wikipedia pages that use any of the templates we care about.
    page_list_cache_file = os.path.join(utils.cache_dir(), "legislators/wikipedia/page_titles")
    if cache and os.path.exists(page_list_cache_file):
        # Load from cache.
        matching_pages = open(page_list_cache_file).read().split("\n")
    else:
        # Query Wikipedia API and save to cache.
        matching_pages = get_matching_pages()
        utils.write(("\n".join(matching_pages)), page_list_cache_file)

    # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon).
    matching_pages = [p for p in matching_pages if ":" not in p]

    # Load each page's content and parse the template.
    for p in sorted(matching_pages):
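        # These titles embed one of the templates but aren't individual member
        # biographies (campaign articles, surname pages, lists, and a couple of
        # one-off cases), so skip them.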
if " campaign" in p: continue
if " (surname)" in p: continue
if "career of " in p: continue
if "for Congress" in p: continue
if p.startswith("List of "): continue
if p in ("New York in the American Civil War", "Upper Marlboro, Maryland"): continue

        # Query the Wikipedia API to get the raw page content in XML,
        # and then use XPath to get the raw page text.
        url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(p.encode("utf8")) + "&export&exportnowrap"
        cache_path = "legislators/wikipedia/pages/" + p

        dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache))
        page_content = dom.xpath("string(mw:page/mw:revision/mw:text)", namespaces={ "mw": "http://www.mediawiki.org/xml/export-0.8/" })
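
        # The export XML declares the MediaWiki export-0.8 namespace, which is
        # why the XPath above needs the explicit "mw" prefix; page_content now
        # holds the page's raw wikitext.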

        # Build a dict for the IDs that we want to insert into our files.
        new_ids = {
            "wikipedia": p # Wikipedia page name, with spaces for spaces (not underscores)
        }
if "CongLinks" in page_content:
# Parse the key/val pairs in the template.
m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
if not m: continue # no template?
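            # m.group(1) is everything between "CongLinks" and the closing
            # braces: a pipe-separated list of key=value template parameters.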
            for arg in m.group(1).split("|"):
                if "=" not in arg: continue
                key, val = arg.split("=", 1)
                key = key.strip()
                val = val.strip()
                if val and key in fieldmap:
                    try:
                        if fieldmap[key] in int_fields: val = int(val)
                    except ValueError:
                        print("invalid value", key, val)
                        continue

                    if key == "opensecrets": val = val.replace("&newMem=Y", "").replace("&newmem=Y", "").replace("&cycle=2004", "").upper()
                    new_ids[fieldmap[key]] = val

            if "bioguide" not in new_ids: continue
            new_ids["bioguide"] = new_ids["bioguide"].upper() # hmm
            bioguide = new_ids["bioguide"]

        else:
            m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
            if not m: continue # no template?
            bioguide = m.group(1).upper()

        if bioguide not in bioguides:
            print("Member not found: " + bioguide, p.encode("utf8"), "(Might have been a delegate to the Constitutional Convention.)")
            continue

        # handle FEC ids specially because they are stored in an array...
        fec_id = new_ids.get("fec")
        if fec_id: del new_ids["fec"]

        member = bioguides[bioguide]
        member["id"].update(new_ids)

        # ...finish the FEC id.
        if fec_id:
            if fec_id not in bioguides[bioguide]["id"].get("fec", []):
                bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)

        #print p.encode("utf8"), new_ids

    utils.save_data(y1, "legislators-current.yaml")
    utils.save_data(y2, "legislators-historical.yaml")

if __name__ == '__main__':
    run()