-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgoogle_api_query.py
More file actions
89 lines (73 loc) · 3.22 KB
/
google_api_query.py
File metadata and controls
89 lines (73 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from googleapiclient.discovery import build
import pprint
from pathlib import Path
import json
# API key handed to build() as `developerKey`; 'xxx' is a placeholder value.
# NOTE(review): avoid committing a real key here - confirm how credentials
# are meant to be supplied for this project.
api_key = 'xxx'
# Search engine id sent as the `cx` parameter of each query; 'xxx' is a
# placeholder value.
cse_id = 'xxx'
def google_search(service, query_keywords, api_key, cse_id):
    """Run a single search query and return the raw response.

    Args:
        service: Client object exposing ``cse().list(...).execute()``.
        query_keywords: Search string, sent as the ``q`` parameter.
        api_key: Unused by this function; kept so existing callers that
            pass it (positionally or by keyword) keep working.
        cse_id: Search engine id, sent as the ``cx`` parameter.

    Returns:
        Whatever ``execute()`` returns for the request (the first page of
        results when no ``start`` offset is given).
    """
    request = service.cse().list(q=query_keywords, cx=cse_id)
    return request.execute()
def google_next_page(service, query_keywords, api_key, cse_id, res, page, max_page, url_items):
    """Fetch follow-up result pages and append their items to ``url_items``.

    Starting from the response ``res``, the function follows
    ``res['queries']['nextPage'][0]['startIndex']`` to request the next
    page (10 results per request), appends that page's items to
    ``url_items`` and repeats with the new response.

    Paging stops when ``page`` reaches ``max_page`` or when a response no
    longer advertises a next page. If ``page >= max_page`` on entry, no
    request is made at all.

    Args:
        service: Client object exposing ``cse().list(...).execute()``.
        query_keywords: Search string, sent as the ``q`` parameter.
        api_key: Unused by this function; kept for caller compatibility.
        cse_id: Search engine id, sent as the ``cx`` parameter.
        res: Response of the previously fetched page.
        page: Number of follow-up pages already fetched.
        max_page: Upper bound for ``page``; at most ``max_page - page``
            requests are issued.
        url_items: List that collected items are appended to. It is
            mutated in place.

    Returns:
        The same ``url_items`` list, extended with the newly fetched items.
    """
    current = res
    while page < max_page:
        # A response without a 'nextPage' entry means there is nothing
        # more to fetch; stop instead of failing on the missing key.
        next_pages = current.get('queries', {}).get('nextPage')
        if not next_pages:
            break
        current = service.cse().list(
            q=query_keywords,
            cx=cse_id,
            num=10,
            start=next_pages[0]['startIndex'],
        ).execute()
        # A page may come back without an 'items' key; treat it as empty.
        url_items.extend(current.get('items', []))
        page += 1
    return url_items
if __name__ == "__main__":
    # Script entry point: query the search API for `key_words`, split the
    # result links into regular pages and PDF documents, and write both
    # lists below ./output next to this file.
    service = build("customsearch", "v1", developerKey=api_key)

    # define the key words to query
    key_words = 'xxx'
    # number of follow-up pages requested after the first page; paging
    # starts at page=0, so up to `max_page` extra pages are fetched
    max_page = 3

    # query the first page
    result = google_search(
        service=service,
        query_keywords=key_words,
        api_key=api_key,
        cse_id=cse_id,
    )
    # 'items' may be absent from the response (e.g. nothing matched); copy
    # the list so that paging does not mutate the raw response
    url_items = list(result.get('items', []))

    # only page further when the first response advertises a next page
    if result.get('queries', {}).get('nextPage'):
        url_items = google_next_page(
            service=service,
            query_keywords=key_words,
            api_key=api_key,
            cse_id=cse_id,
            res=result,
            page=0,
            max_page=max_page,
            url_items=url_items,
        )

    # create the url and domain lists
    urls_list = []
    urls_pdf_list = []
    domains_list = []
    for item in url_items:
        # every item is a dictionary; results carrying a 'fileFormat' key
        # are routed to the pdf list, everything else goes to scrapy
        try:
            link = item['link']
            if 'fileFormat' in item:
                urls_pdf_list.append(link)
            else:
                # read the domain before appending anything so the url and
                # domain lists always stay the same length
                domain = item['displayLink']
                urls_list.append(link)
                domains_list.append(domain)
        except KeyError as e:
            # best effort: report the incomplete result and move on
            print('skipping search result, missing key: %s' % e)
            continue

    # ====== dictionary object holding the parameters for scrapy ======
    settings_json = {
        'name': key_words,
        'allowed_domains': domains_list,
        'start_urls': urls_list,
    }

    # get the current path
    current_path = Path(__file__).parent.absolute()
    # define the output folder for normal links
    result_output = current_path / 'output' / 'urls'
    # define the output folder for pdf links
    result_pdf_output = current_path / 'output' / 'pdf_urls'
    # create the folders if they do not exist yet
    for output_folder in (result_output, result_pdf_output):
        output_folder.mkdir(parents=True, exist_ok=True)

    # save the settings_json result and the pdf result; the explicit
    # encoding keeps the files independent of the platform default
    with (result_output / 'query_result.json').open('w', encoding='utf-8') as fw:
        json.dump(settings_json, fw)
    with (result_pdf_output / 'query_pdf_result.txt').open('w', encoding='utf-8') as pw:
        pw.writelines(url + '\n' for url in urls_pdf_list)