Conduct Legal Research with AI: Part 0
Crawling the Library of Congress API
Introduction
The United States Library of Congress maintains a REST API that developers can use to crawl its collections. It is an open, freely accessible tool that anyone can use to conduct research. Check out the documentation at https://libraryofcongress.github.io/data-exploration/.
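Before writing any crawler code, it helps to request a page of JSON directly and look at what comes back. The short sketch below assumes the loc.gov search endpoint and the documented fo=json (format), c (results per page), and sp (page number) query parameters; check them against the documentation linked above if anything has changed.

import requests

# Request one page of search results as JSON from the loc.gov API.
params = {"q": "constitution", "fo": "json", "c": 25, "sp": 1}
response = requests.get("https://www.loc.gov/search/", params=params)
response.raise_for_status()

page = response.json()
for result in page.get("results", []):
    print(result.get("title"))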
Creating a crawler
I took the approach of writing a generator that produces a search result page object that can be operated on at each iteration.
The first step is to create the search result page object.
The code below implements the search result page object. It contains a number of helper functions to convert the nodes within a result page to JSON or GraphML, and it can also be used to generate an in-memory networkx graph.
The Search Result Object
import json

import networkx as nx
import pandas as pd
import requests
from networkx.readwrite import json_graph


class search_results:
    """One page of Library of Congress search results, with graph helpers."""

    def __init__(self, base_url, collection, json_parameter,
                 results_per_page, query_param, page_param, page_num):
        self.base_url = base_url
        self.collection = collection
        self.json_parameter = json_parameter
        self.results_per_page = results_per_page
        self.query_param = query_param
        self.page_param = page_param
        self.page_num = page_num
        self.search_url = self.build_search_url()
        self.response_json = self.get_response_json()
        # self.soup_html = self.html_parse()
        self.results = self.response_json.get('results', [])
        self.pagination = self.response_json.get('pagination', {})
        self.node_list, self.edge_list = self.build_nodes_and_edges()
        self.graph = self.create_graph()
        self.dict_of_dicts = nx.to_dict_of_dicts(self.graph)

    def build_search_url(self):
        # e.g. https://www.loc.gov/collections/{collection}/?fo=json&c=25&sp=1
        url = self.base_url + self.collection + "/?" + self.json_parameter
        url = url + "&" + self.query_param + str(self.results_per_page)
        url = url + "&" + self.page_param + str(self.page_num)
        return url

    def get_response_json(self):
        response = requests.get(self.search_url)
        response.raise_for_status()
        return response.json()

    def flatten_item(self, item, parent, node_list, edge_list):
        # Recursively flatten one result item: dictionary keys become nodes
        # connected to their parent, lists are unpacked in place, and leaf
        # values become nodes connected to the key they belong to.
        if isinstance(item, dict):
            for k, v in item.items():
                node_list.append((k, {"type": "key"}))
                edge_list.append((parent, k, {"relationship": "of"}))
                self.flatten_item(v, k, node_list, edge_list)
        elif isinstance(item, list):
            for v in item:
                self.flatten_item(v, parent, node_list, edge_list)
        else:
            # This item is no longer a dictionary or list, so create an edge
            # from the previous key to the value itself.
            node_list.append((str(item), {"type": "value"}))
            edge_list.append((parent, str(item), {"relationship": "is"}))

    def build_nodes_and_edges(self):
        node_list, edge_list = [], []
        for item in self.results:
            root = item.get('title', 'untitled')
            node_list.append((root, {"type": "root"}))
            self.flatten_item(item, root, node_list, edge_list)
        return node_list, edge_list

    def create_graph(self):
        graph = nx.Graph()
        graph.add_nodes_from(self.node_list)
        graph.add_edges_from(self.edge_list)
        return graph

    def create_json_graph(self):
        # graph = json_graph.node_link_graph(self.response_json)
        return json_graph.node_link_data(self.graph)

    def to_pandas(self):
        return pd.json_normalize(self.results)

    def to_csv(self, file_num):
        self.to_pandas().to_csv("results_page_{}.csv".format(file_num), index=False)

    def write_graphml(self, file_num):
        nx.write_graphml(self.graph, "results_page_{}.graphml".format(file_num))

    def write_to_file(self, data, file_num):
        with open("results_page_{}.json".format(file_num), "w") as fp:
            json.dump(data, fp, indent=2)
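As a quick sanity check, the class can be instantiated on its own for a single page before wiring up the generator. The collection slug and parameter strings below are illustrative values, not fixed choices; swap in whatever collection you want to crawl.

page = search_results(
    base_url="https://www.loc.gov/collections/",
    collection="united-states-reports",  # example collection slug
    json_parameter="fo=json",
    results_per_page=25,
    query_param="c=",
    page_param="sp=",
    page_num=1,
)

print(page.search_url)
print(len(page.results), "results on this page")
page.write_to_file(data=page.response_json, file_num=1)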
The Generator
The generator yields a search result page as long as the pagination link included in the response is valid.
Depending on your use case, you can pass the page_num and the collection you would like to crawl.
The API limits crawling to roughly 80 requests per minute, so the sleep call throttles the loop to stay under that cap. The number of results returned per page can also be passed to the search result page object; review the code above to see which values can be passed.
import time


def search_result_generator(base_url, collection, json_parameter,
                            results_per_page, query_param, page_param,
                            page_num=1, sleep_seconds=60):
    # Yield one search result page per iteration until the response no
    # longer contains a "next" pagination link.
    more_pages = True
    while more_pages:
        page = search_results(base_url, collection, json_parameter,
                              results_per_page, query_param, page_param,
                              page_num)
        if page.pagination.get('next'):
            more_pages = True
            page_num += 1
            yield page
        else:
            more_pages = False
            yield page
        time.sleep(sleep_seconds)
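Before starting a long crawl, the generator can also be consumed one page at a time, which makes it easy to verify the pagination handling. The argument values here are the same illustrative placeholders used above.

pages = search_result_generator(
    base_url="https://www.loc.gov/collections/",
    collection="united-states-reports",
    json_parameter="fo=json",
    results_per_page=25,
    query_param="c=",
    page_param="sp=",
)

first_page = next(pages)  # fetches and returns page 1
print(first_page.pagination.get("next"))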
The Runner Function
To initiate the crawl, simply run the function below. It writes each result page to JSON.
As a follow-up project, I will post about how to integrate the returned data into a Neo4j database.
def run_search(generator):
    for page_num, obj in enumerate(generator, start=1):
        obj.write_to_file(data=obj.response_json, file_num=page_num)
        # obj.write_graphml(file_num=page_num)
        # obj.to_pandas()
        # obj.write_to_file(data=obj.dict_of_dicts, file_num=page_num)
        # obj.to_csv(file_num=page_num)
Putting Everything Together
The code below is the entire program as it stands. There is built-in functionality to upload the results to a Google Sheet, if you desire, using an extension of the Google API. That code can be found at https://github.com/justin-napolitano/GoogleAPI.
# library_of_congress_scraper.py
import json
import os
import time
from contextlib import contextmanager

import networkx as nx
import pandas as pd
import requests
import yaml
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from networkx.readwrite import json_graph

# import load_vars as lv
# from ratelimiter import RateLimiter


@contextmanager
def cd(newdir):
    """Context manager for changing the current working directory"""
    prevdir = os.getcwd()
    os.chdir(newdir)
    try:
        yield
    finally:
        os.chdir(prevdir)
class search_results:
    """One page of Library of Congress search results, with graph helpers."""

    def __init__(self, base_url, collection, json_parameter,
                 results_per_page, query_param, page_param, page_num):
        self.base_url = base_url
        self.collection = collection
        self.json_parameter = json_parameter
        self.results_per_page = results_per_page
        self.query_param = query_param
        self.page_param = page_param
        self.page_num = page_num
        self.search_url = self.build_search_url()
        self.response_json = self.get_response_json()
        # self.soup_html = self.html_parse()
        self.results = self.response_json.get('results', [])
        self.pagination = self.response_json.get('pagination', {})
        self.node_list, self.edge_list = self.build_nodes_and_edges()
        self.graph = self.create_graph()
        self.dict_of_dicts = nx.to_dict_of_dicts(self.graph)

    def build_search_url(self):
        # e.g. https://www.loc.gov/collections/{collection}/?fo=json&c=25&sp=1
        url = self.base_url + self.collection + "/?" + self.json_parameter
        url = url + "&" + self.query_param + str(self.results_per_page)
        url = url + "&" + self.page_param + str(self.page_num)
        return url

    def get_response_json(self):
        response = requests.get(self.search_url)
        response.raise_for_status()
        return response.json()

    def flatten_item(self, item, parent, node_list, edge_list):
        # Recursively flatten one result item: dictionary keys become nodes
        # connected to their parent, lists are unpacked in place, and leaf
        # values become nodes connected to the key they belong to.
        if isinstance(item, dict):
            for k, v in item.items():
                node_list.append((k, {"type": "key"}))
                edge_list.append((parent, k, {"relationship": "of"}))
                self.flatten_item(v, k, node_list, edge_list)
        elif isinstance(item, list):
            for v in item:
                self.flatten_item(v, parent, node_list, edge_list)
        else:
            # This item is no longer a dictionary or list, so create an edge
            # from the previous key to the value itself.
            node_list.append((str(item), {"type": "value"}))
            edge_list.append((parent, str(item), {"relationship": "is"}))

    def build_nodes_and_edges(self):
        node_list, edge_list = [], []
        for item in self.results:
            root = item.get('title', 'untitled')
            node_list.append((root, {"type": "root"}))
            self.flatten_item(item, root, node_list, edge_list)
        return node_list, edge_list

    def create_graph(self):
        graph = nx.Graph()
        graph.add_nodes_from(self.node_list)
        graph.add_edges_from(self.edge_list)
        return graph

    def create_json_graph(self):
        # graph = json_graph.node_link_graph(self.response_json)
        return json_graph.node_link_data(self.graph)

    def to_pandas(self):
        return pd.json_normalize(self.results)

    def to_csv(self, file_num):
        self.to_pandas().to_csv("results_page_{}.csv".format(file_num), index=False)

    def write_graphml(self, file_num):
        nx.write_graphml(self.graph, "results_page_{}.graphml".format(file_num))

    def write_to_file(self, data, file_num):
        with open("results_page_{}.json".format(file_num), "w") as fp:
            json.dump(data, fp, indent=2)
class google_sheet_client:
    """Optional helper for uploading result pages to a Google Sheet."""

    def __init__(self, spreadsheet_id, sheet_name, scopes):
        self.spreadsheet_id = spreadsheet_id
        self.sheet_name = sheet_name
        self.scopes = scopes
        self.creds = self.get_credentials()
        self.sheets_service = build('sheets', 'v4', credentials=self.creds)
        self.drive_service = build('drive', 'v3', credentials=self.creds)

    def build_range(self, start_cell, end_cell):
        # e.g. "Sheet1!A1:Z1000"
        return self.sheet_name + "!" + start_cell + ":" + end_cell

    def results_to_rows(self, results):
        # Flatten each result dictionary into a header row plus one row per result.
        frame = pd.json_normalize(results).astype(str)
        rows = [list(frame.columns)] + frame.values.tolist()
        return rows

    def list_drive_files(self):
        """Shows basic usage of the Drive v3 API.
        Prints the names and ids of the first 10 files the user has access to.
        """
        # Call the Drive v3 API
        results = self.drive_service.files().list(
            pageSize=10, fields="nextPageToken, files(id, name)").execute()
        items = results.get('files', [])
        for item in items:
            print(item['name'], item['id'])
        return items

    def append_rows(self, rows, start_cell="A1", end_cell="Z1000"):
        body = {'values': rows}
        res = self.sheets_service.spreadsheets().values().append(
            spreadsheetId=self.spreadsheet_id,
            range=self.build_range(start_cell, end_cell),
            valueInputOption='RAW',
            body=body).execute()
        # print(res)
        return res

    def get_credentials(self):
        creds = None
        # The file token.json stores the user's access and refresh tokens, and is
        # created automatically when the authorization flow completes for the first
        # time.
        if os.path.exists('token.json'):
            creds = Credentials.from_authorized_user_file('token.json', self.scopes)
        # If there are no (valid) credentials available, let the user log in.
        # creds = ServiceAccountCredentials.from_json_keyfile_name('add_json_file_here.json', SCOPES)
        # flow = InstalledAppFlow.from_client_secrets_file(
        #     'credentials.json', SCOPES)
        # creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        # with open('token.json', 'w') as token:
        #     token.write(creds.to_json())
        return creds


def load_config(path="config.yaml"):
    # self.yaml_stream = file("config.yaml", 'r')
    with open(path, 'r') as stream:
        data = yaml.safe_load(stream)
    # pprint(data)
    return data


def get_config_value(config, key):
    value = config.get(key)
    return value
def search_result_generator(base_url, collection, json_parameter,
                            results_per_page, query_param, page_param,
                            page_num=1, sleep_seconds=60):
    # Yield one search result page per iteration until the response no
    # longer contains a "next" pagination link.
    more_pages = True
    while more_pages:
        page = search_results(base_url, collection, json_parameter,
                              results_per_page, query_param, page_param,
                              page_num)
        if page.pagination.get('next'):
            more_pages = True
            page_num += 1
            yield page
        else:
            more_pages = False
            yield page
        time.sleep(sleep_seconds)
# search = search_results(base_url,collection,json_parameter,results_per_page,query_param,page_param,page_num)
# pprint(search.search_url)
def create_search_results_page_object(config, page_num=1):
    return search_results(config['base_url'], config['collection'],
                          config['json_parameter'], config['results_per_page'],
                          config['query_param'], config['page_param'], page_num)


def create_search_generator(config):
    generator = search_result_generator(config['base_url'], config['collection'],
                                        config['json_parameter'], config['results_per_page'],
                                        config['query_param'], config['page_param'])
    return generator


# rate_limiter = RateLimiter(max_calls=1, period=60)
# cd to output
# result = create_search_results_page_object()
# with cd("output"):
#     result.write_to_file(data = result.dict_of_dicts, file_num = 1)
def run_search(generator):
    for page_num, obj in enumerate(generator, start=1):
        obj.write_to_file(data=obj.response_json, file_num=page_num)
        # obj.write_graphml(file_num=page_num)
        # obj.to_pandas()
        # obj.write_to_file(data=obj.dict_of_dicts, file_num=page_num)
        # obj.to_csv(file_num=page_num)
if __name__ == '__main__':
    config = load_config("config.yaml")
    generator = create_search_generator(config)
    os.makedirs("output", exist_ok=True)
    with cd("output"):
        run_search(generator)