diff --git a/config/config_template.yaml b/config/config_template.yaml index c53d1fe..c414ac6 100644 --- a/config/config_template.yaml +++ b/config/config_template.yaml @@ -14,6 +14,7 @@ requests: input: input_dir: ../input input_files: + skip_domains: urls: urls.txt keywords: keywords.txt url_max: 100 diff --git a/src/crawl/HesitantCrawler.py b/src/crawl/HesitantCrawler.py index 74a9de4..2703494 100644 --- a/src/crawl/HesitantCrawler.py +++ b/src/crawl/HesitantCrawler.py @@ -21,7 +21,8 @@ def __init__( fetcher: HTMLFetcher, target_keywords: List[str], add_sitemapurls: bool = False, - max_depth: int = 1): + max_depth: int = 1, + skip_domains: List[str] = []): """ Depth-limited Search Targeted Crawler Crawler class for obtaining urls from start_url. @@ -64,6 +65,10 @@ def __init__( self.target_keywords = target_keywords logging.info(f"The targeted crawl will look for given keywords: {', '.join(self.target_keywords)}") + # Skip domains + self.skip_domains = skip_domains + logging.info(f"The targeted crawl will skip domains: {', '.join(self.skip_domains)}") + # Excluded URLs which contain: self._unsupported = ( ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps", @@ -91,6 +96,13 @@ def skip_this_url(self, url: str) -> bool: # prevent duplicate crawl from trailing forward slash in URL url = url.rstrip('/') if url.endswith('/') else url + # prevent duplicate crawl from '#' such as '#content', '#main', etc. + url = url.rstrip("#") if url.contains("#") else url + + if any([skip_domain in url for skip_domain in self.skip_domains]): + logging.debug(f"Skip {url}, because domain is in skip-list") + return True # skip + # Do not revisit pages if url in self._visited: logging.debug(f"Skip {url}, because we have visited it before") diff --git a/src/crawl/__init__.py b/src/crawl/__init__.py index 77999db..6fe4b71 100644 --- a/src/crawl/__init__.py +++ b/src/crawl/__init__.py @@ -1,2 +1,3 @@ from .base import ICrawler, NoCrawler, BaseCrawler, CrawlResult -from .HesitantCrawler import HesitantCrawler \ No newline at end of file +from .HesitantCrawler import HesitantCrawler +from .scrapymodules import ScrapyResult \ No newline at end of file diff --git a/src/crawl/base.py b/src/crawl/base.py index 36b51ca..1a9f576 100644 --- a/src/crawl/base.py +++ b/src/crawl/base.py @@ -2,7 +2,7 @@ from typing import NamedTuple, List import logging from urllib.parse import urlparse - +from scrapy.http import Response from fetch import IFetcher @@ -11,6 +11,7 @@ class CrawlResult(NamedTuple): source: str targeted: bool = None first_keyword_hit: str = None + crawl_depth: int = 0 class ICrawler(ABC): diff --git a/src/crawl/scrapymodules/HesitantSpider.py b/src/crawl/scrapymodules/HesitantSpider.py new file mode 100644 index 0000000..e535647 --- /dev/null +++ b/src/crawl/scrapymodules/HesitantSpider.py @@ -0,0 +1,141 @@ +from typing import List +import scrapy +import validators +from urllib.parse import urlparse, urljoin +import logging +import re +from .ScrapyResult import ScrapyResult + +class HesitantSpider(scrapy.Spider): + name = "hesitant-spider" + + # Define custom settings as a class attribute + custom_settings = { + "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "AUTOTHROTTLE_ENABLED": True, # Auto throttle to maximize speed without risking blocks + "AUTOTHROTTLE_START_DELAY": 1.0, # Start slow to "warm up" + "AUTOTHROTTLE_MAX_DELAY": 10.0, # Never wait more than 10s + "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0, # Aim for 1 request per worker at a time + "DOWNLOAD_DELAY": 0, # Let Autothrottle handle the delay + } + + def __init__( + self, + start_urls: str, + target_keywords: List[str] = [], + add_sitemap_urls: bool = False, + max_depth: int = 1, + skip_domains: List[str] = [], + *args, **kwargs + ): + super(HesitantSpider, self).__init__(*args, **kwargs) + + self.start_urls = start_urls + self.logger.debug(f"Init start_urls: {self.start_urls}") + self.max_depth = max_depth + self.logger.debug(f"Init max depth: {self.max_depth}") + self.skip_domains = skip_domains + self.logger.debug(f"Init skip domains: {self.skip_domains}") + self.target_keywords = target_keywords + self.logger.debug(f"Init target keywords: {self.target_keywords}") + + self._unsupported = ( + ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps", + ".woff2", ".svg", ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".aiff", ".3gp", ".asf", ".asx", ".avi", ".mp4", + ".woff", ".mpg", ".qt", ".rm", ".swf", ".wmv", ".m4a", ".css", ".pdf", ".doc", ".docx", ".exe", ".bin", ".rss", ".zip", + ".rar", ".msu", ".flv", ".dmg", ".xls", ".xlsx", ".ico", ".mng?download=true", ".pct?download=true", ".bmp?download=true", + ".gif?download=true", ".jpg?download=true", ".jpeg?download=true", ".png?download=true", ".pst?download=true", + ".psp?download=true", ".tif?download=true", ".tiff?download=true", ".ai?download=true", ".drw?download=true", + ".dxf?download=true", ".eps?download=true", ".ps?download=true", ".svg?download=true", ".mp3?download=true", + ".wma?download=true", ".ogg?download=true", ".wav?download=true", ".ra?download=true", ".aac?download=true", + ".mid?download=true", ".au?download=true", ".aiff?download=true", ".3gp?download=true", ".asf?download=true", + ".asx?download=true", ".avi?download=true", ".mov?download=true", ".mp4?download=true", ".mpg?download=true", + ".qt?download=true", ".rm?download=true", ".swf?download=true", ".wmv?download=true", ".m4a?download=true", + ".css?download=true", ".pdf?download=true", ".doc?download=true", ".exe?download=true", ".bin?download=true", + ".rss?download=true", ".zip?download=true", ".rar?download=true", ".msu?download=true", ".flv?download=true", + ".dmg?download=true") + self.logger.debug(f"URLs will be excluded if they contain any in path:{', '.join(self._unsupported)}") + + self.results = [] + self.visited = set() + + if max_depth < 0: + self.logger.debug("Only urls from starting_url can be found, max_depth < 0") + + def url_is_target(self, url: str) -> bool: + for keyword in self.target_keywords: + first_keyword_hit = re.search(keyword, url) + if first_keyword_hit is not None: + return True + + def skip_this_url(self, url: str) -> bool: + """Function to see if we have already visited url""" + + if not validators.url(url): + return True + + if any(ext in url for ext in self._unsupported): + self.logger.debug(f"Skip {url}, because extension is unsupported") + return True + + # prevent duplicate crawl from trailing forward slash in URL + url = url.rstrip('/') if url.endswith('/') else url + + # prevent duplicate crawl from '#' such as '#content', '#main', etc. + url = url.rstrip("#") if "#" in url else url + + if any([skip_domain in url for skip_domain in self.skip_domains]): + self.logger.debug(f"Skip {url}, because domain is in skip-list") + return True # skip + + # Do not revisit pages + if url in self.visited: + self.logger.debug(f"Skip {url}, because we have visited it before") + return True # skip + return False + + async def start(self): + for start_url in self.start_urls: + yield scrapy.Request(url=start_url, callback=self.parse, meta={"depth": 0}) + + def parse(self, response): + self.logger.debug(f"Parsing url: {response.url}") + self.visited.add(response.url) + + yield {"url": response.url, "html":response.text[:10]} + current_depth = response.meta.get("depth", 0) + if not self.url_is_target(response.url) and current_depth >= self.max_depth: + return + + # Process the current page + if self.url_is_target(response.url): + # Add results + self.results.append( + ScrapyResult( + url=response.url, + status=response.status, + text=response.text[:1], + crawl_depth=current_depth + ) + ) + + # Reset current depth because we found target at current page + current_depth = 0 + + # Extract and follow links + for link in response.css("a::attr(href)").getall(): + url = urljoin(response.url, link) + + # Keep crawling restricted to the start domain and avoid skipped domains + if self.skip_this_url(url): + continue + + yield scrapy.Request( + url=url, + callback=self.parse, + meta={"depth": current_depth + 1} + ) + + def closed(self, reason): + """Optional: Scrapy built-in method called when the spider finishes""" + print(f"Spider closed because of: {reason}. Total collected pages: {len(self.results)}") \ No newline at end of file diff --git a/src/crawl/scrapymodules/ScrapyCrawlMiddleware.py b/src/crawl/scrapymodules/ScrapyCrawlMiddleware.py new file mode 100644 index 0000000..7ce42ea --- /dev/null +++ b/src/crawl/scrapymodules/ScrapyCrawlMiddleware.py @@ -0,0 +1,27 @@ +import logging +from scrapy.exceptions import IgnoreRequest + +exceptions = [ + ".txt", + ".xml", + ".rss" +] + + +class TextTypeFilterMiddleware: + """ + Drops any response that isn't HTML or XHTML. + """ + def process_response(self, request, response, spider): + if any([response.url.endswith(exception) for exception in exceptions]): + logging.debug(f"Making exception bypass for url: {response.url}") + return response + content_type = response.headers.get('Content-Type', b'').decode('utf-8').lower() + + # Only allow HTML-based content + if 'text/html' not in content_type and 'application/xhtml+xml' not in content_type and 'application/xml' not in content_type: + logging.info(f"\t\tTextTypeFilterMiddleware: Skipping non-text content: {response.url} ({content_type})") + # Returning None tells Scrapy to drop this response entirely + raise IgnoreRequest("Not Text type response, ignore request") + + return response diff --git a/src/crawl/scrapymodules/ScrapyResult.py b/src/crawl/scrapymodules/ScrapyResult.py new file mode 100644 index 0000000..f1be36b --- /dev/null +++ b/src/crawl/scrapymodules/ScrapyResult.py @@ -0,0 +1,7 @@ +from typing import NamedTuple + +class ScrapyResult(NamedTuple): + url: str + status: str + text: str + crawl_depth: int = 0 diff --git a/src/crawl/scrapymodules/__init__.py b/src/crawl/scrapymodules/__init__.py new file mode 100644 index 0000000..12459fc --- /dev/null +++ b/src/crawl/scrapymodules/__init__.py @@ -0,0 +1,3 @@ +from .HesitantSpider import HesitantSpider +from .ScrapyResult import ScrapyResult +from .ScrapyCrawlMiddleware import TextTypeFilterMiddleware \ No newline at end of file diff --git a/src/main.py b/src/main.py index e7a9b4f..27c7fb3 100644 --- a/src/main.py +++ b/src/main.py @@ -45,4 +45,3 @@ def main(): # CONFIG = setup("../config/config.yaml") # df = pd.read_parquet(f"{CONFIG.output.output_dir}/20260304_080625", engine="pyarrow") # print(df.head()) - diff --git a/src/main_scrapy.py b/src/main_scrapy.py new file mode 100644 index 0000000..a154eb9 --- /dev/null +++ b/src/main_scrapy.py @@ -0,0 +1,118 @@ +import os +import logging +import numpy as np +import multiprocessing +import sys +from datetime import datetime + +from scrapy.crawler import CrawlerProcess + +from util import setup, normalize_url +from crawl.scrapymodules import HesitantSpider + +CONFIG = setup("config/config.yaml") + + +def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, logfile): + print(f"Args: urls: {urls}, keywords: {keywords}, skip_domains: {skip_domains}, process_id: {process_id} ") + print(f"Starting crawling process (PID: {process_id}, OSPID: {os.getpid()}) for {urls}!") + project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) + if project_root not in sys.path: + sys.path.insert(0, project_root) + + process = CrawlerProcess( + settings={ + "ROBOTSTXT_OBEY": True, + "LOG_LEVEL": "INFO", + "LOG_FILE": logfile, + "DOWNLOADER_MIDDLEWARES": { + "src.crawl.scrapymodules.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority + }, + "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"] + } + ) + + root_logger = logging.getLogger() + root_logger.setLevel(log_level) + root_logger.handlers = [] + + fileHandler = logging.FileHandler(logfile) + fileHandler.setLevel(log_level) + root_logger.addHandler(fileHandler) + + # Remove console output + # We get the logger that Scrapy uses and remove all handlers that print to the console + scrapy_logger = logging.getLogger('scrapy') + for handler in scrapy_logger.handlers[:]: + scrapy_logger.removeHandler(handler) + + # (Optional) If you want to be extremely thorough, silence the engine too + logging.getLogger('twisted').handlers = [] + + spiderCrawler = process.create_crawler(HesitantSpider) + + process.crawl( + spiderCrawler, + start_urls=urls, + max_depth=1, + target_keywords=keywords, + skip_domains=skip_domains + ) + + if len(urls) == 0: + return [] + + try: + process.start() + except Exception as e: + print(f"Got here! Error {e}") + + if spiderCrawler.spider is not None: + print(f"Returning results of length for PID {process_id}: {len(spiderCrawler.spider.results)}") + return spiderCrawler.spider.results + + +if __name__ == "__main__": + + # Input URLs + file_urls = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.urls}" + logging.info(f"Reading list of base-urls from file: {file_urls}") + with open(file_urls, 'r', encoding='utf-8') as file_in: + urls = [line.rstrip() for line in file_in] + + # Normalize URLs + urls = [*map(normalize_url, urls)] + + # Keywords + file_keywords = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.keywords}" + logging.info(f"Reading list of keywords from file: {file_keywords}") + with open(file_keywords, 'r', encoding='utf-8') as file_in: + target_keywords = [line.rstrip() for line in file_in] + + # Skip domains + file_skip_domains = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}" + logging.info(f"Reading list of skip_domains from file: {file_skip_domains}") + with open(file_skip_domains, 'r', encoding='utf-8') as file_in: + skip_domains = [line.rstrip() for line in file_in] + + num_workers = min([len(urls), 6]) + batch_size = len(urls) // num_workers if len(urls) > num_workers else 1 + url_chunks = np.array_split(urls, num_workers) + + chunked_args = [] + + logfile = f"output/logs/log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log" + + for i in range(0, num_workers): + chunked_args.append( + (url_chunks[i], target_keywords, skip_domains, i, logging.INFO, logfile) + ) + + print("# Workers:", num_workers) + + print("# Cores available:", multiprocessing.cpu_count()) + with multiprocessing.Pool(processes=num_workers) as pool: + results = sum(pool.starmap(spawn_spider_process, chunked_args), []) + + print("Results:", results) + print("#Results:", len(results)) diff --git a/src/scrape/__init__.py b/src/scrape/__init__.py index ded790d..4ebab41 100644 --- a/src/scrape/__init__.py +++ b/src/scrape/__init__.py @@ -16,12 +16,17 @@ def build_webfocusedscraper(user_agent: str) -> IScraper: with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.keywords}", 'r', encoding='utf-8') as file_in: target_keywords = [line.rstrip() for line in file_in] + with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}", 'r', encoding='utf-8') as file_in: + skip_domains = [line.rstrip() for line in file_in] + fetcher = HTMLFetcher(user_agent=user_agent) crawler = HesitantCrawler( fetcher=fetcher, target_keywords=target_keywords, add_sitemapurls=CONFIG.crawl.use_sitemap, - max_depth=CONFIG.crawl.max_depth) + max_depth=CONFIG.crawl.max_depth, + skip_domains=skip_domains + ) htmlparser = HTMLBodyParser() return Scraper( diff --git a/src/util/__init__.py b/src/util/__init__.py index 46cd90e..bb2a8cc 100644 --- a/src/util/__init__.py +++ b/src/util/__init__.py @@ -1 +1,2 @@ -from .setup import setup \ No newline at end of file +from .setup import setup +from .urls import normalize_url \ No newline at end of file diff --git a/src/util/urls.py b/src/util/urls.py new file mode 100644 index 0000000..62cb112 --- /dev/null +++ b/src/util/urls.py @@ -0,0 +1,35 @@ +from urllib.parse import urlparse, urlunparse +import re + + +# Normalize URL to make sure crawler can handle it without issue +def normalize_url(url): + # Handle case where there is no scheme at all + if not re.match(r'^[a-zA-Z]+://', url): + url = 'https://' + url + + parsed = urlparse(url) + + # 2. Force HTTPS + scheme = 'https' + + # 3. Handle the domain (netloc) + netloc = parsed.netloc.lower() + + # Remove existing 'www.' to re-add cleanly + if netloc.startswith('www.'): + netloc = netloc[4:] + + netloc = 'www.' + netloc + + # Reconstruct URL + new_url = urlunparse(( + scheme, + netloc, + parsed.path, + parsed.params, + parsed.query, + parsed.fragment + )) + + return new_url