forked from unitedstates/congress-legislators
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy paththomas_ids.py
executable file
·85 lines (67 loc) · 4.11 KB
/
thomas_ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python
# Update current THOMAS IDs using beta.congress.gov. Congressmen's
# IDs are updated directly. For Senators, we just print out new
# IDs because name matching is hard.
import lxml.html, io, urllib.request, urllib.parse, urllib.error
import re
import utils
from utils import download, load_data, save_data
def run():
    """Refresh THOMAS IDs in legislators-current.yaml from beta.congress.gov.

    The member listing of each chamber is downloaded and parsed.  For the
    House, every listed member is matched to a current representative by
    state and district, and the THOMAS ID is written straight into that
    record.  For the Senate, no matching is attempted: any THOMAS ID not
    already present on a current senator is printed so that it can be
    entered by hand.  The (possibly modified) data is saved back to
    legislators-current.yaml at the end.

    Pass the ``cache`` flag (read through ``utils.flags()``) to allow the
    download helper to reuse previously fetched pages; by default every
    page is fetched again.
    """
    CONGRESS_ID = "113th Congress (2013-2014)"  # the query string parameter

    # constants
    state_names = {
        "Alabama": "AL", "Alaska": "AK", "American Samoa": "AS",
        "Arizona": "AZ", "Arkansas": "AR", "California": "CA",
        "Colorado": "CO", "Connecticut": "CT", "Delaware": "DE",
        "District of Columbia": "DC", "Florida": "FL", "Georgia": "GA",
        "Guam": "GU", "Hawaii": "HI", "Idaho": "ID", "Illinois": "IL",
        "Indiana": "IN", "Iowa": "IA", "Kansas": "KS", "Kentucky": "KY",
        "Louisiana": "LA", "Maine": "ME", "Maryland": "MD",
        "Massachusetts": "MA", "Michigan": "MI", "Minnesota": "MN",
        "Mississippi": "MS", "Missouri": "MO", "Montana": "MT",
        "Nebraska": "NE", "Nevada": "NV", "New Hampshire": "NH",
        "New Jersey": "NJ", "New Mexico": "NM", "New York": "NY",
        "North Carolina": "NC", "North Dakota": "ND",
        "Northern Mariana Islands": "MP", "Ohio": "OH", "Oklahoma": "OK",
        "Oregon": "OR", "Pennsylvania": "PA", "Puerto Rico": "PR",
        "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD",
        "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT",
        "Virgin Islands": "VI", "Virginia": "VA", "Washington": "WA",
        "West Virginia": "WV", "Wisconsin": "WI", "Wyoming": "WY",
    }

    # The numeric THOMAS ID is the last path component of a member link.
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    member_href_re = re.compile(r"/member/.*/(\d+)$")

    # default to not caching
    use_cache = utils.flags().get('cache', False)
    force = not use_cache

    # load in current members
    y = load_data("legislators-current.yaml")
    by_district = {}
    existing_senator_ids = set()
    for m in y:
        last_term = m['terms'][-1]
        if last_term['type'] == 'rep':
            # Key representatives by state + zero-padded district, e.g. "CA05".
            full_district = "%s%02d" % (last_term['state'], int(last_term['district']))
            by_district[full_district] = m
        elif last_term['type'] == 'sen':
            if "thomas" in m["id"]:
                existing_senator_ids.add(m["id"]["thomas"])

    seen_ids = set()
    for chamber in ("House of Representatives", "Senate"):
        url = "http://beta.congress.gov/members?pageSize=500&Legislative_Source=Member+Profiles&Congress=%s&Chamber_of_Congress=%s" % (
            urllib.parse.quote_plus(CONGRESS_ID), urllib.parse.quote_plus(chamber))
        # Kept separate from the boolean flag above so the two are not confused.
        cache_path = "congress.gov/members/%s-%s.html" % (CONGRESS_ID, chamber)
        try:
            body = download(url, cache_path, force)
            dom = lxml.html.parse(io.StringIO(body)).getroot()
        except lxml.etree.XMLSyntaxError:
            print("Error parsing: ", url)
            continue

        for node in dom.xpath("//ul[@class='results_list']/li"):
            # The heading link carries both the member's name and the ID-bearing URL.
            link = node.xpath('h2/a')[0]
            thomas_id = "%05d" % int(member_href_re.search(link.get('href')).group(1))

            # THOMAS misassigned these 'new' IDs to existing individuals.
            if thomas_id in ('02139', '02132'):
                continue

            name = link.text
            state = node.xpath('div[@class="memberProfile"]/table/tbody/tr[1]/td')[0].text.strip()
            state = state_names[state]

            if chamber == "House of Representatives":
                # There's enough information to easily pick out which Member this refers to, so write it
                # directly to the file.
                district = node.xpath('div[@class="memberProfile"]/table/tbody/tr[2]/td')[0].text.strip()
                if district == "At Large":
                    district = 0
                district = "%02d" % int(district)

                if state + district not in by_district:
                    print(state + district + "'s", name, "appears on Congress.gov but the office is vacant in our data.")
                    continue

                # Warn on duplicates; the later listing still overwrites the earlier one.
                if state + district in seen_ids:
                    print("Congress.gov lists two people for %s%s!" % (state, district))
                seen_ids.add(state + district)

                by_district[state + district]["id"]["thomas"] = thomas_id

            elif chamber == "Senate":
                # For senators we'd have to match on name or something else, so that's too difficult.
                # Just look for new IDs.
                if thomas_id not in existing_senator_ids:
                    print("Please manually set", thomas_id, "for", name, "from", state)

    save_data(y, "legislators-current.yaml")
# Script entry point: perform the update only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run()