SNStatComp · lhaarman · May 19, 2026 · May 21, 2026
diff --git a/config/config_template.yaml b/config/config_template.yaml
@@ -14,6 +14,7 @@ requests:
 input:
   input_dir: ../input
   input_files:
+    skip_domains: skip_domains.txt
     urls: urls.txt
     keywords: keywords.txt
   url_max: 100

diff --git a/src/crawl/HesitantCrawler.py b/src/crawl/HesitantCrawler.py
@@ -21,7 +21,8 @@ def __init__(
             fetcher: HTMLFetcher,
             target_keywords: List[str],
             add_sitemapurls: bool = False,
-            max_depth: int = 1):
+            max_depth: int = 1,
+            skip_domains: List[str] = []):
         """
         Depth-limited Search Targeted Crawler
         Crawler class for obtaining urls from start_url.
@@ -64,6 +65,10 @@ def __init__(
         self.target_keywords = target_keywords
         logging.info(f"The targeted crawl will look for given keywords: {', '.join(self.target_keywords)}")
 
+        # Skip domains
+        self.skip_domains = skip_domains
+        logging.info(f"The targeted crawl will skip domains: {', '.join(self.skip_domains)}")
+
         # Excluded URLs which contain:
         self._unsupported = (
             ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps",
@@ -91,6 +96,13 @@ def skip_this_url(self, url: str) -> bool:
         # prevent duplicate crawl from trailing forward slash in URL
         url = url.rstrip('/') if url.endswith('/') else url
 
+        # drop the '#fragment' ('#content', '#main', ...): it points inside the same page; then drop a re-exposed '/'
+        url = url.split('#', 1)[0].rstrip('/')
+
+        if any(skip_domain and skip_domain in url for skip_domain in self.skip_domains):  # '' would match every url
+            logging.debug(f"Skip {url}, because domain is in skip-list")
+            return True # skip
+
         # Do not revisit pages
         if url in self._visited:
             logging.debug(f"Skip {url}, because we have visited it before")

diff --git a/src/crawl/__init__.py b/src/crawl/__init__.py
@@ -1,2 +1,3 @@
 from .base import ICrawler, NoCrawler, BaseCrawler, CrawlResult
-from .HesitantCrawler import HesitantCrawler
+from .HesitantCrawler import HesitantCrawler
+from .scrapymodules import ScrapyResult
diff --git a/src/crawl/base.py b/src/crawl/base.py
@@ -2,7 +2,7 @@
 from typing import NamedTuple, List
 import logging
 from urllib.parse import urlparse
-
+from scrapy.http import Response
 from fetch import IFetcher
 
 
@@ -11,6 +11,7 @@ class CrawlResult(NamedTuple):
     source: str
     targeted: bool = None
     first_keyword_hit: str = None
+    crawl_depth: int = 0
 
 
 class ICrawler(ABC):

diff --git a/src/crawl/scrapymodules/HesitantSpider.py b/src/crawl/scrapymodules/HesitantSpider.py
@@ -0,0 +1,141 @@
+from typing import List
+import scrapy
+import validators
+from urllib.parse import urlparse, urljoin
+import logging
+import re
+from .ScrapyResult import ScrapyResult
+
+class HesitantSpider(scrapy.Spider):
+    """
+    Depth-limited, keyword-targeted Scrapy spider.
+
+    Starting from ``start_urls`` the spider follows links for at most
+    ``max_depth`` hops. A page whose URL matches one of ``target_keywords``
+    is stored in ``self.results`` and resets the depth budget, so links below
+    a target page are followed for another ``max_depth`` hops.
+    """
+    name = "hesitant-spider"
+
+    # Private request-meta key holding the remaining-depth bookkeeping.
+    # Scrapy's built-in DepthMiddleware owns meta["depth"] and rewrites it on
+    # every request a callback yields, which would undo the reset that happens
+    # on target pages.
+    DEPTH_META_KEY = "hesitant_depth"
+
+    # Define custom settings as a class attribute
+    custom_settings = {
+        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "AUTOTHROTTLE_ENABLED": True, # Auto throttle to maximize speed without risking blocks
+        "AUTOTHROTTLE_START_DELAY": 1.0,  # Start slow to "warm up"
+        "AUTOTHROTTLE_MAX_DELAY": 10.0,   # Never wait more than 10s
+        "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0,  # Aim for 1 request per worker at a time
+        "DOWNLOAD_DELAY": 0,               # Let Autothrottle handle the delay
+    }
+
+    def __init__(
+        self,
+        start_urls: List[str],
+        target_keywords: List[str] = None,
+        add_sitemap_urls: bool = False,
+        max_depth: int = 1,
+        skip_domains: List[str] = None,
+        *args, **kwargs
+    ):
+        """
+        :param start_urls: URLs to start crawling from (a single string is accepted too)
+        :param target_keywords: regular expressions; a URL matching any of them is a target
+        :param add_sitemap_urls: accepted for signature compatibility, not used by this spider
+        :param max_depth: number of hops to follow beyond a start or target page
+        :param skip_domains: substrings; URLs containing any of them are not followed
+        """
+        super().__init__(*args, **kwargs)
+
+        # A bare string would otherwise be iterated character by character.
+        if isinstance(start_urls, str):
+            start_urls = [start_urls]
+        # str() turns e.g. numpy string scalars into plain Python strings.
+        self.start_urls = [str(url) for url in start_urls]
+        self.logger.debug(f"Init start_urls: {self.start_urls}")
+        self.max_depth = max_depth
+        self.logger.debug(f"Init max depth: {self.max_depth}")
+        # Empty entries (blank lines in the input file) would match every URL.
+        self.skip_domains = [domain for domain in (skip_domains or []) if domain]
+        self.logger.debug(f"Init skip domains: {self.skip_domains}")
+        # An empty regular expression matches every URL, so drop those as well.
+        self.target_keywords = [keyword for keyword in (target_keywords or []) if keyword]
+        self.logger.debug(f"Init target keywords: {self.target_keywords}")
+
+        self._unsupported = (
+            ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps",
+            ".woff2", ".svg", ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".aiff", ".3gp", ".asf", ".asx", ".avi", ".mp4",
+            ".woff", ".mpg", ".qt", ".rm", ".swf", ".wmv", ".m4a", ".css", ".pdf", ".doc", ".docx", ".exe", ".bin", ".rss", ".zip",
+            ".rar", ".msu", ".flv", ".dmg", ".xls", ".xlsx", ".ico", ".mng?download=true", ".pct?download=true", ".bmp?download=true",
+            ".gif?download=true", ".jpg?download=true", ".jpeg?download=true", ".png?download=true", ".pst?download=true",
+            ".psp?download=true", ".tif?download=true", ".tiff?download=true", ".ai?download=true", ".drw?download=true",
+            ".dxf?download=true", ".eps?download=true", ".ps?download=true", ".svg?download=true", ".mp3?download=true",
+            ".wma?download=true", ".ogg?download=true", ".wav?download=true", ".ra?download=true", ".aac?download=true",
+            ".mid?download=true", ".au?download=true", ".aiff?download=true", ".3gp?download=true", ".asf?download=true",
+            ".asx?download=true", ".avi?download=true", ".mov?download=true", ".mp4?download=true", ".mpg?download=true",
+            ".qt?download=true", ".rm?download=true", ".swf?download=true", ".wmv?download=true", ".m4a?download=true",
+            ".css?download=true", ".pdf?download=true", ".doc?download=true", ".exe?download=true", ".bin?download=true",
+            ".rss?download=true", ".zip?download=true", ".rar?download=true", ".msu?download=true", ".flv?download=true",
+            ".dmg?download=true")
+        # Plain extensions are compared against the end of the URL path only.
+        self._unsupported_path_suffixes = tuple(ext for ext in self._unsupported if "?" not in ext)
+        self.logger.debug(f"URLs will be excluded if they contain any in path:{', '.join(self._unsupported)}")
+
+        self.results = []
+        self.visited = set()
+
+        if max_depth < 0:
+            self.logger.debug("Only urls from starting_url can be found, max_depth < 0")
+
+    @staticmethod
+    def _normalize_url(url: str) -> str:
+        """Return ``url`` without its '#fragment' and without trailing slashes."""
+        return url.split("#", 1)[0].rstrip("/")
+
+    def url_is_target(self, url: str) -> bool:
+        """Return True when ``url`` matches at least one of the target keywords."""
+        return any(re.search(keyword, url) is not None for keyword in self.target_keywords)
+
+    def skip_this_url(self, url: str) -> bool:
+        """
+        Return True when ``url`` must not be followed: it is not a valid URL,
+        has an unsupported file extension, contains a skip-list entry or was
+        visited before.
+        """
+
+        if not validators.url(url):
+            return True
+
+        # Match extensions on the path (and, for the '?download=true' variants,
+        # on path plus query). A substring test on the whole URL would also hit
+        # host names such as 'www.rabobank.nl' (".ra") or 'docs.example.com' (".doc").
+        parsed = urlparse(url)
+        path = parsed.path.lower()
+        path_and_query = f"{path}?{parsed.query.lower()}"
+        if path.endswith(self._unsupported_path_suffixes) or path_and_query.endswith(self._unsupported):
+            self.logger.debug(f"Skip {url}, because extension is unsupported")
+            return True
+
+        # prevent duplicate crawl from a '#fragment' or a trailing forward slash
+        url = self._normalize_url(url)
+
+        if any(skip_domain in url for skip_domain in self.skip_domains):
+            self.logger.debug(f"Skip {url}, because domain is in skip-list")
+            return True  # skip
+
+        # Do not revisit pages
+        if url in self.visited:
+            self.logger.debug(f"Skip {url}, because we have visited it before")
+            return True  # skip
+        return False
+
+    async def start(self):
+        """Yield the initial requests, one per start URL, at depth 0."""
+        for start_url in self.start_urls:
+            yield scrapy.Request(url=start_url, callback=self.parse, meta={self.DEPTH_META_KEY: 0})
+
+    def parse(self, response):
+        """
+        Handle one downloaded page: emit an item, record the page when its URL
+        is a target and follow its links while the depth budget allows.
+        """
+        self.logger.debug(f"Parsing url: {response.url}")
+        # Store the normalised form: that is what skip_this_url() looks up.
+        self.visited.add(self._normalize_url(response.url))
+
+        # NOTE(review): only the first 10 characters are emitted - looks like a debugging leftover, confirm.
+        yield {"url": response.url, "html": response.text[:10]}
+        current_depth = response.meta.get(self.DEPTH_META_KEY, 0)
+        is_target = self.url_is_target(response.url)
+        if not is_target and current_depth >= self.max_depth:
+            return
+
+        # Process the current page
+        if is_target:
+            # Add results
+            self.results.append(
+                ScrapyResult(
+                    url=response.url,
+                    status=response.status,
+                    # NOTE(review): the text is cut to a single character - confirm this is intended.
+                    text=response.text[:1],
+                    crawl_depth=current_depth
+                )
+            )
+
+            # Reset current depth because we found target at current page
+            current_depth = 0
+
+        # Extract and follow links
+        for link in response.css("a::attr(href)").getall():
+            url = urljoin(response.url, link)
+
+            # Leave out invalid, unsupported, skip-listed and already visited URLs
+            if self.skip_this_url(url):
+                continue
+
+            yield scrapy.Request(
+                url=url,
+                callback=self.parse,
+                meta={self.DEPTH_META_KEY: current_depth + 1}
+            )
+
+    def closed(self, reason):
+        """Optional: Scrapy built-in method called when the spider finishes"""
+        print(f"Spider closed because of: {reason}. Total collected pages: {len(self.results)}")
diff --git a/src/crawl/scrapymodules/ScrapyCrawlMiddleware.py b/src/crawl/scrapymodules/ScrapyCrawlMiddleware.py
@@ -0,0 +1,27 @@
+import logging
+from scrapy.exceptions import IgnoreRequest
+
+# URL suffixes that are always let through, whatever Content-Type the server reports
+exceptions = [
+    ".txt",
+    ".xml",
+    ".rss"
+]
+
+
+class TextTypeFilterMiddleware:
+    """
+    Downloader middleware that drops every response which is not HTML, XHTML or XML.
+
+    Responses whose URL ends with one of the suffixes in ``exceptions`` bypass
+    the Content-Type check.
+    """
+    # A response passes when its Content-Type header contains one of these
+    allowed_content_types = ("text/html", "application/xhtml+xml", "application/xml")
+
+    def process_response(self, request, response, spider):
+        """
+        Return ``response`` unchanged when it is allowed.
+
+        :raises IgnoreRequest: when the Content-Type is not an allowed type
+        """
+        if response.url.endswith(tuple(exceptions)):
+            logging.debug(f"Making exception bypass for url: {response.url}")
+            return response
+        # errors='replace': a malformed header must not crash the middleware
+        content_type = response.headers.get('Content-Type', b'').decode('utf-8', errors='replace').lower()
+
+        # Only allow HTML/XML-based content
+        if not any(allowed in content_type for allowed in self.allowed_content_types):
+            logging.info(f"\t\tTextTypeFilterMiddleware: Skipping non-text content: {response.url} ({content_type})")
+            # Raising IgnoreRequest makes Scrapy drop this response
+            raise IgnoreRequest("Not Text type response, ignore request")
+
+        return response
diff --git a/src/crawl/scrapymodules/ScrapyResult.py b/src/crawl/scrapymodules/ScrapyResult.py
@@ -0,0 +1,7 @@
+from typing import NamedTuple
+
+class ScrapyResult(NamedTuple):
+    """One page recorded by the Scrapy-based crawler."""
+    # URL of the response
+    url: str
+    # HTTP status code of the response (the spider passes ``response.status``)
+    status: int
+    # response body text as stored by the spider (may be truncated)
+    text: str
+    # depth value the spider had for the page when it was recorded
+    crawl_depth: int = 0
diff --git a/src/crawl/scrapymodules/__init__.py b/src/crawl/scrapymodules/__init__.py
@@ -0,0 +1,3 @@
+from .HesitantSpider import HesitantSpider
+from .ScrapyResult import ScrapyResult
+from .ScrapyCrawlMiddleware import TextTypeFilterMiddleware
diff --git a/src/main.py b/src/main.py
@@ -45,4 +45,3 @@ def main():
     # CONFIG = setup("../config/config.yaml")
     # df = pd.read_parquet(f"{CONFIG.output.output_dir}/20260304_080625", engine="pyarrow")
     # print(df.head())
-
diff --git a/src/main_scrapy.py b/src/main_scrapy.py
@@ -0,0 +1,118 @@
+import os
+import logging
+import numpy as np
+import multiprocessing
+import sys
+from datetime import datetime
+
+from scrapy.crawler import CrawlerProcess
+
+from util import setup, normalize_url
+from crawl.scrapymodules import HesitantSpider
+
+# Project configuration, loaded once when this module is imported.
+# NOTE(review): the path is presumably resolved against the working directory;
+# the commented example in src/main.py uses "../config/config.yaml" - confirm
+# from which directory this script is meant to be started.
+CONFIG = setup("config/config.yaml")
+
+
+def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, logfile):
+    """
+    Run one HesitantSpider crawl inside the current (worker) process.
+
+    :param urls: start URLs for this worker (list or numpy array of strings)
+    :param keywords: target keywords handed to the spider
+    :param skip_domains: skip-list handed to the spider
+    :param process_id: worker number, only used in messages
+    :param log_level: logging level (int or level name) for Scrapy and the log file
+    :param logfile: path of the log file
+    :return: the results collected by the spider; an empty list when there
+        was nothing to crawl or no spider was created
+    """
+    print(f"Args: urls: {urls}, keywords: {keywords}, skip_domains: {skip_domains}, process_id: {process_id} ")
+    print(f"Starting crawling process (PID: {process_id}, OSPID: {os.getpid()}) for {urls}!")
+
+    # Nothing to crawl: leave before any Scrapy machinery is set up.
+    # len() instead of truthiness, because urls may be a numpy array.
+    if len(urls) == 0:
+        return []
+
+    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+    if project_root not in sys.path:
+        sys.path.insert(0, project_root)
+
+    # Scrapy expects a level name; getLevelName() maps e.g. logging.INFO to "INFO"
+    level_name = log_level if isinstance(log_level, str) else logging.getLevelName(log_level)
+
+    process = CrawlerProcess(
+        settings={
+            "ROBOTSTXT_OBEY": True,
+            "LOG_LEVEL": level_name,
+            "LOG_FILE": logfile,
+            "DOWNLOADER_MIDDLEWARES": {
+                "src.crawl.scrapymodules.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543  # High priority
+            },
+            # NOTE(review): this does not look like a built-in Scrapy setting; the
+            # content-type filtering is done by the middleware above - confirm.
+            "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"]
+        }
+    )
+
+    # Replace whatever handlers are installed on the root logger by one file handler
+    root_logger = logging.getLogger()
+    root_logger.setLevel(log_level)
+    root_logger.handlers = []
+
+    fileHandler = logging.FileHandler(logfile)
+    fileHandler.setLevel(log_level)
+    root_logger.addHandler(fileHandler)
+
+    # Remove console output
+    # We get the logger that Scrapy uses and remove all handlers that print to the console
+    scrapy_logger = logging.getLogger('scrapy')
+    for handler in scrapy_logger.handlers[:]:
+        scrapy_logger.removeHandler(handler)
+
+    # (Optional) If you want to be extremely thorough, silence the engine too
+    logging.getLogger('twisted').handlers = []
+
+    spiderCrawler = process.create_crawler(HesitantSpider)
+
+    process.crawl(
+        spiderCrawler,
+        # plain str objects, also when the chunk is a numpy array
+        start_urls=[str(url) for url in urls],
+        max_depth=1,
+        target_keywords=keywords,
+        skip_domains=skip_domains
+    )
+
+    try:
+        process.start()
+    except Exception as e:
+        # Best effort: keep whatever was collected, but leave a traceback in the log
+        logging.exception(f"Crawl failed for PID {process_id}")
+        print(f"Crawl failed for PID {process_id}: {e}")
+
+    spider = spiderCrawler.spider
+    if spider is None:
+        return []
+
+    print(f"Returning results of length for PID {process_id}: {len(spider.results)}")
+    return spider.results
+
+
+def read_nonempty_lines(path):
+    """Return the stripped, non-empty lines of the UTF-8 text file at ``path``."""
+    with open(path, 'r', encoding='utf-8') as file_in:
+        stripped = (line.strip() for line in file_in)
+        return [line for line in stripped if line]
+
+
+def split_into_chunks(items, max_chunks):
+    """
+    Split ``items`` into at most ``max_chunks`` lists of nearly equal size.
+
+    Returns an empty list for empty input, because ``np.array_split`` cannot
+    split into zero sections.
+    """
+    num_chunks = min(len(items), max_chunks)
+    if num_chunks == 0:
+        return []
+    return [chunk.tolist() for chunk in np.array_split(items, num_chunks)]
+
+
+def main():
+    """Read the input files, crawl the URLs in parallel worker processes and print the results."""
+    # Input URLs
+    file_urls = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.urls}"
+    logging.info(f"Reading list of base-urls from file: {file_urls}")
+    # Normalize URLs
+    urls = [normalize_url(url) for url in read_nonempty_lines(file_urls)]
+
+    # Keywords
+    file_keywords = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.keywords}"
+    logging.info(f"Reading list of keywords from file: {file_keywords}")
+    target_keywords = read_nonempty_lines(file_keywords)
+
+    # Skip domains
+    file_skip_domains = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}"
+    logging.info(f"Reading list of skip_domains from file: {file_skip_domains}")
+    skip_domains = read_nonempty_lines(file_skip_domains)
+
+    url_chunks = split_into_chunks(urls, 6)
+    num_workers = len(url_chunks)
+    if num_workers == 0:
+        print("No urls to crawl")
+        return
+
+    # The timestamp is built outside the f-string: nested double quotes need Python 3.12+
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    log_dir = "output/logs"
+    os.makedirs(log_dir, exist_ok=True)
+    logfile = f"{log_dir}/log_{timestamp}.log"
+
+    chunked_args = [
+        (url_chunk, target_keywords, skip_domains, worker_id, logging.INFO, logfile)
+        for worker_id, url_chunk in enumerate(url_chunks)
+    ]
+
+    print("# Workers:", num_workers)
+
+    print("# Cores available:", multiprocessing.cpu_count())
+    # maxtasksperchild=1 with chunksize=1: every crawl gets a fresh process,
+    # because the Twisted reactor cannot be started twice in one process.
+    with multiprocessing.Pool(processes=num_workers, maxtasksperchild=1) as pool:
+        per_worker = pool.starmap(spawn_spider_process, chunked_args, chunksize=1)
+    results = [result for worker_results in per_worker for result in worker_results]
+
+    print("Results:", results)
+    print("#Results:", len(results))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/scrape/__init__.py b/src/scrape/__init__.py
@@ -16,12 +16,17 @@ def build_webfocusedscraper(user_agent: str) -> IScraper:
     with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.keywords}", 'r', encoding='utf-8') as file_in:
         target_keywords = [line.rstrip() for line in file_in]
 
+    with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}", 'r', encoding='utf-8') as file_in:
+        skip_domains = [line.rstrip() for line in file_in]
+
     fetcher = HTMLFetcher(user_agent=user_agent)
     crawler = HesitantCrawler(
         fetcher=fetcher,
         target_keywords=target_keywords,
         add_sitemapurls=CONFIG.crawl.use_sitemap,
-        max_depth=CONFIG.crawl.max_depth)
+        max_depth=CONFIG.crawl.max_depth,
+        skip_domains=skip_domains
+    )
     htmlparser = HTMLBodyParser()
 
     return Scraper(

diff --git a/src/util/__init__.py b/src/util/__init__.py
@@ -1 +1,2 @@
-from .setup import setup
+from .setup import setup
+from .urls import normalize_url
Original file line number	Diff line number	Diff line change
Expand Up		@@ -45,4 +45,3 @@ def main():
		# CONFIG = setup("../config/config.yaml")
		# df = pd.read_parquet(f"{CONFIG.output.output_dir}/20260304_080625", engine="pyarrow")
		# print(df.head())