From ab2f8345aa8654fb13b49b89e251f6751c0827a6 Mon Sep 17 00:00:00 2001 From: jmcurran Date: Sun, 10 Dec 2023 14:50:17 +1300 Subject: [PATCH 1/5] Trying to add flexibility of file types downloaded --- Reddit_image_scraper.py | 7 +++++-- config.ini | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py index ad48798..023c5f2 100644 --- a/Reddit_image_scraper.py +++ b/Reddit_image_scraper.py @@ -205,7 +205,9 @@ def get_client_info(): if not os.path.exists('config.ini'): with open('config.ini', 'w') as f: log('config.ini template created. Please paste in your client secret. (And RTM)') - f.write("""[ALPHA] + f.write("""[DEFAULT] +ext='.webm', '.gif', '.avi', '.mp4', '.jpg', '.png', '.mov', '.ogg', '.wmv', '.mp2', '.mp3', '.mkv' +[ALPHA] client_id=PASTE ID HERE client_secret=PASTE SECRET HERE query_limit=2000 @@ -220,7 +222,8 @@ def get_client_info(): ratelimit_sleep = config["ALPHA"]["ratelimit_sleep"] failure_sleep = config["ALPHA"]["failure_sleep"] minimum_file_size_kb = config["ALPHA"]["minimum_file_size_kb"] - return id, secret, int(query_limit), int(ratelimit_sleep), int(failure_sleep), float(minimum_file_size_kb) + ext=config["ALPHA"]["ext"] + return id, secret, int(query_limit), int(ratelimit_sleep), int(failure_sleep), float(minimum_file_size_kb), ext def is_media_file(uri): diff --git a/config.ini b/config.ini index b5cf5ee..be00674 100644 --- a/config.ini +++ b/config.ini @@ -1,3 +1,5 @@ +[DEFAULT] +ext='.webm', '.gif', '.avi', '.mp4', '.jpg', '.png', '.mov', '.ogg', '.wmv', '.mp2', '.mp3', '.mkv' [ALPHA] client_id= client_secret= From 938dfbbd05b452eed32ea10fed4a05c438df8738 Mon Sep 17 00:00:00 2001 From: jmcurran Date: Sun, 10 Dec 2023 17:10:59 +1300 Subject: [PATCH 2/5] ext --- Reddit_image_scraper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py index 023c5f2..d1462c0 100644 --- a/Reddit_image_scraper.py +++ b/Reddit_image_scraper.py @@ -460,8 +460,8 @@ def read_img_links(submissions, url_list, user_submissions): if __name__ == '__main__': # Get client info - ClientInfo.id, ClientInfo.secret, query_lookup_limit, ratelimit_sleep, failure_sleep, minimum_file_size_kb = get_client_info() - + ClientInfo.id, ClientInfo.secret, query_lookup_limit, ratelimit_sleep, failure_sleep, minimum_file_size_kb, ext = get_client_info() + print(ext) # Create project directories create_directories() From 06bfbdf2301a8a831748a1b717b69d3c0e59d891 Mon Sep 17 00:00:00 2001 From: jmcurran Date: Sun, 10 Dec 2023 17:16:30 +1300 Subject: [PATCH 3/5] split mask --- Reddit_image_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py index d1462c0..5b92e04 100644 --- a/Reddit_image_scraper.py +++ b/Reddit_image_scraper.py @@ -223,7 +223,7 @@ def get_client_info(): failure_sleep = config["ALPHA"]["failure_sleep"] minimum_file_size_kb = config["ALPHA"]["minimum_file_size_kb"] ext=config["ALPHA"]["ext"] - return id, secret, int(query_limit), int(ratelimit_sleep), int(failure_sleep), float(minimum_file_size_kb), ext + return id, secret, int(query_limit), int(ratelimit_sleep), int(failure_sleep), float(minimum_file_size_kb), ext.split(',') def is_media_file(uri): From b5978c1d4e6640f8f495dd546af63acf1ab1c11e Mon Sep 17 00:00:00 2001 From: jmcurran Date: Sun, 10 Dec 2023 17:19:05 +1300 Subject: [PATCH 4/5] fixing extensions --- Reddit_image_scraper.py | 2 +- config.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py index 5b92e04..04dedcc 100644 --- a/Reddit_image_scraper.py +++ b/Reddit_image_scraper.py @@ -206,7 +206,7 @@ def get_client_info(): with open('config.ini', 'w') as f: log('config.ini template created. Please paste in your client secret. (And RTM)') f.write("""[DEFAULT] -ext='.webm', '.gif', '.avi', '.mp4', '.jpg', '.png', '.mov', '.ogg', '.wmv', '.mp2', '.mp3', '.mkv' +ext=.webm, .gif, .avi, .mp4, .jpg, .png', .mov, .ogg, .wmv, .mp2, .mp3, .mkv' [ALPHA] client_id=PASTE ID HERE client_secret=PASTE SECRET HERE diff --git a/config.ini b/config.ini index be00674..9173795 100644 --- a/config.ini +++ b/config.ini @@ -1,5 +1,5 @@ [DEFAULT] -ext='.webm', '.gif', '.avi', '.mp4', '.jpg', '.png', '.mov', '.ogg', '.wmv', '.mp2', '.mp3', '.mkv' +ext=.webm, .gif, .avi, .mp4, .jpg, .png, .mov, .ogg, .wmv, .mp2, .mp3, .mkv [ALPHA] client_id= client_secret= From da106c2c62d9f7ba42b623e2e1f1396ec8a867dd Mon Sep 17 00:00:00 2001 From: jmcurran Date: Sun, 10 Dec 2023 17:22:55 +1300 Subject: [PATCH 5/5] Next step --- Reddit_image_scraper.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Reddit_image_scraper.py b/Reddit_image_scraper.py index 04dedcc..a3158d7 100644 --- a/Reddit_image_scraper.py +++ b/Reddit_image_scraper.py @@ -226,7 +226,7 @@ def get_client_info(): return id, secret, int(query_limit), int(ratelimit_sleep), int(failure_sleep), float(minimum_file_size_kb), ext.split(',') -def is_media_file(uri): +def is_media_file(uri, extensions): # print('Original Link:' + img_link) # enable this if you want to log the literal URLs it sees regex = '([.][\w]+)$' re.compile(regex) @@ -236,7 +236,7 @@ def is_media_file(uri): ext = uri[-4:] if t: ext = t.group() - if ext in ('.webm', '.gif', '.avi', '.mp4', '.jpg', '.png', '.mov', '.ogg', '.wmv', '.mp2', '.mp3', '.mkv'): + if ext in extensions: return True else: return False @@ -403,7 +403,7 @@ def download_img(img_url, img_title, file_loc, sub, ratelimit_sleep: int, failur return 1 -def read_img_links(submissions, url_list, user_submissions): +def read_img_links(submissions, url_list, user_submissions, extensions): sub = submissions.lower() if user_submissions: if not os.path.exists('./users/{}'.format(sub)): @@ -429,7 +429,7 @@ def read_img_links(submissions, url_list, user_submissions): # print(link[-4:]) # print('gfycat found:{}'.format(link)) link = link + '.gif' - if not is_media_file(link): + if not is_media_file(link, extensions): continue file_name = link.split('/')[-1] @@ -460,8 +460,8 @@ def read_img_links(submissions, url_list, user_submissions): if __name__ == '__main__': # Get client info - ClientInfo.id, ClientInfo.secret, query_lookup_limit, ratelimit_sleep, failure_sleep, minimum_file_size_kb, ext = get_client_info() - print(ext) + ClientInfo.id, ClientInfo.secret, query_lookup_limit, ratelimit_sleep, failure_sleep, minimum_file_size_kb, extensions = get_client_info() + print(extensions) # Create project directories create_directories() @@ -495,7 +495,7 @@ def read_img_links(submissions, url_list, user_submissions): if url_list: try: log('{} images found on {}'.format(len(url_list), redditor)) - count, status, already_here = read_img_links(redditor, url_list, True) + count, status, already_here = read_img_links(redditor, url_list, True, extensions) if status == 1: log( @@ -528,7 +528,7 @@ def read_img_links(submissions, url_list, user_submissions): if url_list: try: log('{} images found on {}'.format(len(url_list), subreddit)) - count, status, already_here = read_img_links(subreddit, url_list, 0) + count, status, already_here = read_img_links(subreddit, url_list, 0, extensions) if status == 1: log(