Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/config_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ requests:
input:
input_dir: ../input
input_files:
skip_domains:
urls: urls.txt
keywords: keywords.txt
url_max: 100
Expand Down
14 changes: 13 additions & 1 deletion src/crawl/HesitantCrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def __init__(
fetcher: HTMLFetcher,
target_keywords: List[str],
add_sitemapurls: bool = False,
max_depth: int = 1):
max_depth: int = 1,
skip_domains: List[str] = []):
"""
Depth-limited Search Targeted Crawler
Crawler class for obtaining urls from start_url.
Expand Down Expand Up @@ -64,6 +65,10 @@ def __init__(
self.target_keywords = target_keywords
logging.info(f"The targeted crawl will look for given keywords: {', '.join(self.target_keywords)}")

# Skip domains
self.skip_domains = skip_domains
logging.info(f"The targeted crawl will skip domains: {', '.join(self.skip_domains)}")

# Excluded URLs which contain:
self._unsupported = (
".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps",
Expand Down Expand Up @@ -91,6 +96,13 @@ def skip_this_url(self, url: str) -> bool:
# prevent duplicate crawl from trailing forward slash in URL
url = url.rstrip('/') if url.endswith('/') else url

# prevent duplicate crawl from '#' such as '#content', '#main', etc.
url = url.rstrip("#") if url.contains("#") else url

if any([skip_domain in url for skip_domain in self.skip_domains]):
logging.debug(f"Skip {url}, because domain is in skip-list")
return True # skip

# Do not revisit pages
if url in self._visited:
logging.debug(f"Skip {url}, because we have visited it before")
Expand Down
3 changes: 2 additions & 1 deletion src/crawl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .base import ICrawler, NoCrawler, BaseCrawler, CrawlResult
from .HesitantCrawler import HesitantCrawler
from .HesitantCrawler import HesitantCrawler
from .scrapymodules import ScrapyResult
3 changes: 2 additions & 1 deletion src/crawl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import NamedTuple, List
import logging
from urllib.parse import urlparse

from scrapy.http import Response
from fetch import IFetcher


Expand All @@ -11,6 +11,7 @@ class CrawlResult(NamedTuple):
source: str
targeted: bool = None
first_keyword_hit: str = None
crawl_depth: int = 0


class ICrawler(ABC):
Expand Down
141 changes: 141 additions & 0 deletions src/crawl/scrapymodules/HesitantSpider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import logging
import re
from typing import List, Optional
from urllib.parse import urldefrag, urljoin, urlparse

import scrapy
import validators

from .ScrapyResult import ScrapyResult

class HesitantSpider(scrapy.Spider):
    """
    Depth-limited, keyword-targeted Scrapy spider.

    Starting from ``start_urls`` the spider follows ``<a href>`` links. A page
    whose URL matches one of ``target_keywords`` (regular expressions applied
    with ``re.search``) is appended to ``self.results`` and resets the depth
    budget, so links found on a hit may be followed for another ``max_depth``
    levels. Pages that are not hits are no longer expanded once the budget is
    used up.
    """
    name = "hesitant-spider"

    # Define custom settings as a class attribute
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "AUTOTHROTTLE_ENABLED": True,  # Auto throttle to maximize speed without risking blocks
        "AUTOTHROTTLE_START_DELAY": 1.0,  # Start slow to "warm up"
        "AUTOTHROTTLE_MAX_DELAY": 10.0,  # Never wait more than 10s
        "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0,  # Aim for 1 request per worker at a time
        "DOWNLOAD_DELAY": 0,  # Let Autothrottle handle the delay
    }

    # Private meta key for the depth budget. Scrapy's DepthMiddleware owns
    # meta["depth"] and rewrites it on every request yielded from a callback,
    # so a reset stored under "depth" would be silently overwritten.
    _DEPTH_BUDGET_KEY = "hesitant_depth"

    def __init__(
            self,
            start_urls: List[str],
            target_keywords: Optional[List[str]] = None,
            add_sitemap_urls: bool = False,
            max_depth: int = 1,
            skip_domains: Optional[List[str]] = None,
            *args, **kwargs
    ):
        """
        Args:
            start_urls: URLs to start crawling from. A single URL string is
                accepted too and treated as a one-element list.
            target_keywords: regular expressions; a URL matching any of them
                is a hit. Empty entries are dropped because an empty pattern
                matches every URL.
            add_sitemap_urls: accepted for interface compatibility; not used
                by this spider.
            max_depth: number of link levels to follow below a start URL or
                below the most recent hit.
            skip_domains: substrings; a URL containing any of them is never
                requested. Empty entries are dropped because the empty string
                is contained in every URL.
        """
        super().__init__(*args, **kwargs)

        # Iterating a bare string would yield single characters as "URLs".
        if isinstance(start_urls, str):
            start_urls = [start_urls]
        self.start_urls = list(start_urls)
        self.logger.debug(f"Init start_urls: {self.start_urls}")
        self.max_depth = max_depth
        self.logger.debug(f"Init max depth: {self.max_depth}")
        self.skip_domains = [] if skip_domains is None else [d for d in skip_domains if d]
        self.logger.debug(f"Init skip domains: {self.skip_domains}")
        self.target_keywords = [] if target_keywords is None else [k for k in target_keywords if k]
        self.logger.debug(f"Init target keywords: {self.target_keywords}")

        self._unsupported = (
            ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps",
            ".woff2", ".svg", ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".aiff", ".3gp", ".asf", ".asx", ".avi", ".mp4",
            ".woff", ".mpg", ".qt", ".rm", ".swf", ".wmv", ".m4a", ".css", ".pdf", ".doc", ".docx", ".exe", ".bin", ".rss", ".zip",
            ".rar", ".msu", ".flv", ".dmg", ".xls", ".xlsx", ".ico", ".mng?download=true", ".pct?download=true", ".bmp?download=true",
            ".gif?download=true", ".jpg?download=true", ".jpeg?download=true", ".png?download=true", ".pst?download=true",
            ".psp?download=true", ".tif?download=true", ".tiff?download=true", ".ai?download=true", ".drw?download=true",
            ".dxf?download=true", ".eps?download=true", ".ps?download=true", ".svg?download=true", ".mp3?download=true",
            ".wma?download=true", ".ogg?download=true", ".wav?download=true", ".ra?download=true", ".aac?download=true",
            ".mid?download=true", ".au?download=true", ".aiff?download=true", ".3gp?download=true", ".asf?download=true",
            ".asx?download=true", ".avi?download=true", ".mov?download=true", ".mp4?download=true", ".mpg?download=true",
            ".qt?download=true", ".rm?download=true", ".swf?download=true", ".wmv?download=true", ".m4a?download=true",
            ".css?download=true", ".pdf?download=true", ".doc?download=true", ".exe?download=true", ".bin?download=true",
            ".rss?download=true", ".zip?download=true", ".rar?download=true", ".msu?download=true", ".flv?download=true",
            ".dmg?download=true")
        self.logger.debug(f"URLs will be excluded if they contain any in path:{', '.join(self._unsupported)}")

        # ScrapyResult entries for every page whose URL matched a keyword
        self.results = []
        # Normalised URLs (see _normalize_url) of every page parsed so far
        self.visited = set()

        if max_depth < 0:
            self.logger.debug("Only urls from starting_url can be found, max_depth < 0")

    @staticmethod
    def _normalize_url(url: str) -> str:
        """Return ``url`` without its ``#fragment`` and without trailing slashes."""
        return urldefrag(url).url.rstrip("/")

    def url_is_target(self, url: str) -> bool:
        """Return True when ``url`` matches at least one target keyword pattern."""
        return any(re.search(keyword, url) is not None for keyword in self.target_keywords)

    def skip_this_url(self, url: str) -> bool:
        """
        Decide whether ``url`` must not be requested.

        A URL is skipped when it is not a valid URL, when its path or query
        contains an unsupported extension, when it contains an entry of the
        skip-list, or when its normalised form has been parsed before.
        """
        if not validators.url(url):
            return True

        # Look for extensions in path + query only: matching the whole URL
        # would also hit hostnames (".ra" is contained in "www.rabobank.nl").
        parsed = urlparse(url)
        path_and_query = parsed.path + (f"?{parsed.query}" if parsed.query else "")
        if any(ext in path_and_query for ext in self._unsupported):
            self.logger.debug(f"Skip {url}, because extension is unsupported")
            return True

        # Prevent duplicate crawls caused by a trailing '/' or by fragments
        # such as '#content' and '#main'.
        url = self._normalize_url(url)

        # NOTE(review): this is a substring test on the whole URL, so an entry
        # also matches when it occurs in the path or query - confirm intended.
        if any(skip_domain in url for skip_domain in self.skip_domains):
            self.logger.debug(f"Skip {url}, because domain is in skip-list")
            return True  # skip

        # Do not revisit pages
        if url in self.visited:
            self.logger.debug(f"Skip {url}, because we have visited it before")
            return True  # skip
        return False

    async def start(self):
        """Yield the initial request for every start URL with a fresh depth budget."""
        for start_url in self.start_urls:
            yield scrapy.Request(
                url=start_url,
                callback=self.parse,
                meta={"depth": 0, self._DEPTH_BUDGET_KEY: 0}
            )

    def parse(self, response):
        """
        Record the page, store it as a result when its URL is a hit and
        follow its links while the depth budget allows.
        """
        self.logger.debug(f"Parsing url: {response.url}")
        # Store the normalised form so the lookup in skip_this_url can match.
        self.visited.add(self._normalize_url(response.url))

        # NOTE(review): only the first 10 characters of the body are emitted -
        # looks like a debugging leftover, confirm.
        yield {"url": response.url, "html": response.text[:10]}

        # Distance from the start URL, as tracked under meta["depth"].
        crawl_depth = response.meta.get("depth", 0)
        # Levels used since the start URL or the most recent hit.
        current_depth = response.meta.get(self._DEPTH_BUDGET_KEY, crawl_depth)

        is_target = self.url_is_target(response.url)
        if not is_target and current_depth >= self.max_depth:
            return

        # Process the current page
        if is_target:
            # Add results
            # NOTE(review): text keeps a single character of the body -
            # presumably a placeholder, confirm.
            self.results.append(
                ScrapyResult(
                    url=response.url,
                    status=response.status,
                    text=response.text[:1],
                    crawl_depth=crawl_depth
                )
            )

            # Reset current depth because we found target at current page
            current_depth = 0

        # Extract and follow links
        for link in response.css("a::attr(href)").getall():
            url = urljoin(response.url, link)

            # Drop invalid, unsupported, skip-listed and already visited URLs.
            # The crawl is not restricted to the start domain.
            if self.skip_this_url(url):
                continue

            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={self._DEPTH_BUDGET_KEY: current_depth + 1}
            )

    def closed(self, reason):
        """Optional: Scrapy built-in method called when the spider finishes"""
        print(f"Spider closed because of: {reason}. Total collected pages: {len(self.results)}")
27 changes: 27 additions & 0 deletions src/crawl/scrapymodules/ScrapyCrawlMiddleware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import logging
from scrapy.exceptions import IgnoreRequest

from urllib.parse import urlsplit

# URL suffixes that are always let through, whatever Content-Type is reported.
exceptions = [
    ".txt",
    ".xml",
    ".rss"
]


class TextTypeFilterMiddleware:
    """
    Downloader middleware that drops responses which are not HTML/XHTML/XML.

    A response is passed on unchanged when its URL (or the path part of its
    URL) ends with one of the suffixes in ``exceptions``, or when its
    ``Content-Type`` header contains ``text/html``, ``application/xhtml+xml``
    or ``application/xml``. Every other response raises ``IgnoreRequest``.
    """

    _ALLOWED_CONTENT_TYPES = ("text/html", "application/xhtml+xml", "application/xml")

    def process_response(self, request, response, spider):
        """Return ``response`` when it is allowed, raise ``IgnoreRequest`` otherwise."""
        suffixes = tuple(exceptions)
        # Also test the bare path so "/sitemap.xml?page=2" is recognised.
        if response.url.endswith(suffixes) or urlsplit(response.url).path.endswith(suffixes):
            logging.debug(f"Making exception bypass for url: {response.url}")
            return response

        # errors="replace": a malformed header must not crash the middleware.
        content_type = response.headers.get('Content-Type', b'').decode('utf-8', errors='replace').lower()

        # Only allow HTML-based content
        if not any(allowed in content_type for allowed in self._ALLOWED_CONTENT_TYPES):
            logging.info(f"\t\tTextTypeFilterMiddleware: Skipping non-text content: {response.url} ({content_type})")
            # Raising IgnoreRequest makes Scrapy discard this response.
            raise IgnoreRequest("Not Text type response, ignore request")

        return response
7 changes: 7 additions & 0 deletions src/crawl/scrapymodules/ScrapyResult.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from typing import NamedTuple

class ScrapyResult(NamedTuple):
    """Immutable record describing one page collected by the spider."""
    # URL of the collected page
    url: str
    # HTTP status code of the response (an integer such as 200)
    status: int
    # Body text, or an excerpt of it, as stored by the producer
    text: str
    # Depth at which the page was found; defaults to 0
    crawl_depth: int = 0
3 changes: 3 additions & 0 deletions src/crawl/scrapymodules/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .HesitantSpider import HesitantSpider
from .ScrapyResult import ScrapyResult
from .ScrapyCrawlMiddleware import TextTypeFilterMiddleware
1 change: 0 additions & 1 deletion src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,3 @@ def main():
# CONFIG = setup("../config/config.yaml")
# df = pd.read_parquet(f"{CONFIG.output.output_dir}/20260304_080625", engine="pyarrow")
# print(df.head())

118 changes: 118 additions & 0 deletions src/main_scrapy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import os
import logging
import numpy as np
import multiprocessing
import sys
from datetime import datetime

from scrapy.crawler import CrawlerProcess

from util import setup, normalize_url
from crawl.scrapymodules import HesitantSpider

# Project configuration, loaded once when this module is imported.
# NOTE(review): the path is relative - presumably resolved against the current
# working directory, so the script has to be started from the matching
# directory; confirm against util.setup.
CONFIG = setup("config/config.yaml")


def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, logfile):
    """
    Run one HesitantSpider crawl in the current process and return its results.

    Intended as the worker function of a multiprocessing pool: it always
    returns a list so the caller can concatenate the results of all workers.

    Args:
        urls: start URLs for this worker (any sized sequence).
        keywords: target keyword patterns handed to the spider.
        skip_domains: skip-list handed to the spider.
        process_id: worker index, only used in progress messages.
        log_level: level for the root logger and its file handler.
        logfile: path of the log file shared by Scrapy and the root logger.

    Returns:
        The spider's ``results`` list, or ``[]`` when there was nothing to
        crawl or no spider instance was created.
    """
    print(f"Args: urls: {urls}, keywords: {keywords}, skip_domains: {skip_domains}, process_id: {process_id} ")
    print(f"Starting crawling process (PID: {process_id}, OSPID: {os.getpid()}) for {urls}!")

    # Nothing to crawl: return before any crawler or log file is set up.
    # len() is used on purpose; truth-testing a numpy array chunk is ambiguous.
    if len(urls) == 0:
        return []

    # The middleware below is referenced by its "src." dotted path, which
    # needs the project root on sys.path to be importable.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    if project_root not in sys.path:
        sys.path.insert(0, project_root)

    process = CrawlerProcess(
        settings={
            "ROBOTSTXT_OBEY": True,
            "LOG_LEVEL": "INFO",
            "LOG_FILE": logfile,
            "DOWNLOADER_MIDDLEWARES": {
                "src.crawl.scrapymodules.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543  # High priority
            },
            # NOTE(review): this does not look like a built-in Scrapy setting
            # and nothing in view reads it - confirm it is needed.
            "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"]
        }
    )

    # Replace whatever handlers are installed on the root logger with a
    # single file handler.
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)
    root_logger.handlers = []

    fileHandler = logging.FileHandler(logfile)
    fileHandler.setLevel(log_level)
    root_logger.addHandler(fileHandler)

    # Remove console output
    # We get the logger that Scrapy uses and remove all handlers that print to the console
    scrapy_logger = logging.getLogger('scrapy')
    for handler in scrapy_logger.handlers[:]:
        scrapy_logger.removeHandler(handler)

    # (Optional) If you want to be extremely thorough, silence the engine too
    logging.getLogger('twisted').handlers = []

    spiderCrawler = process.create_crawler(HesitantSpider)

    process.crawl(
        spiderCrawler,
        start_urls=urls,
        max_depth=1,
        target_keywords=keywords,
        skip_domains=skip_domains
    )

    try:
        process.start()
    except Exception as e:
        # Best effort: keep whatever was collected, but record the traceback
        # in the log file instead of only printing the message.
        logging.exception(f"Crawl process {process_id} failed")
        print(f"Got here! Error {e}")

    spider = spiderCrawler.spider
    if spider is None:
        # Without a spider there are no results; return a list, not None,
        # so the caller can still concatenate the worker results.
        return []

    print(f"Returning results of length for PID {process_id}: {len(spider.results)}")
    return spider.results


if __name__ == "__main__":
    # Script entry point: read the URL, keyword and skip-domain files named in
    # CONFIG, split the URLs over at most 6 worker processes, crawl them in
    # parallel and print the combined results.

    # Input URLs (blank lines are dropped)
    file_urls = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.urls}"
    logging.info(f"Reading list of base-urls from file: {file_urls}")
    with open(file_urls, 'r', encoding='utf-8') as file_in:
        urls = [line.rstrip() for line in file_in if line.strip()]

    # Normalize URLs
    urls = [*map(normalize_url, urls)]

    # Keywords (blank lines are dropped: an empty pattern matches every URL)
    file_keywords = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.keywords}"
    logging.info(f"Reading list of keywords from file: {file_keywords}")
    with open(file_keywords, 'r', encoding='utf-8') as file_in:
        target_keywords = [line.rstrip() for line in file_in if line.strip()]

    # Skip domains (blank lines are dropped: '' is contained in every URL)
    file_skip_domains = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}"
    logging.info(f"Reading list of skip_domains from file: {file_skip_domains}")
    with open(file_skip_domains, 'r', encoding='utf-8') as file_in:
        skip_domains = [line.rstrip() for line in file_in if line.strip()]

    if not urls:
        # np.array_split cannot split into 0 sections and a pool needs at
        # least one process, so stop here.
        print("No URLs to crawl.")
        sys.exit(0)

    num_workers = min(len(urls), 6)
    # Plain lists of str are handed to the workers instead of numpy arrays.
    url_chunks = [chunk.tolist() for chunk in np.array_split(urls, num_workers)]

    # The file handler does not create missing directories.
    log_dir = "output/logs"
    os.makedirs(log_dir, exist_ok=True)
    # Single quotes inside the f-string keep this line valid before Python 3.12.
    logfile = f"{log_dir}/log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

    chunked_args = [
        (url_chunk, target_keywords, skip_domains, i, logging.INFO, logfile)
        for i, url_chunk in enumerate(url_chunks)
    ]

    print("# Workers:", num_workers)

    print("# Cores available:", multiprocessing.cpu_count())
    with multiprocessing.Pool(processes=num_workers) as pool:
        # Flatten the per-worker lists; "or []" tolerates a worker that
        # returned None.
        results = [
            result
            for worker_results in pool.starmap(spawn_spider_process, chunked_args)
            for result in (worker_results or [])
        ]

    print("Results:", results)
    print("#Results:", len(results))
7 changes: 6 additions & 1 deletion src/scrape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,17 @@ def build_webfocusedscraper(user_agent: str) -> IScraper:
with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.keywords}", 'r', encoding='utf-8') as file_in:
target_keywords = [line.rstrip() for line in file_in]

with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}", 'r', encoding='utf-8') as file_in:
skip_domains = [line.rstrip() for line in file_in]

fetcher = HTMLFetcher(user_agent=user_agent)
crawler = HesitantCrawler(
fetcher=fetcher,
target_keywords=target_keywords,
add_sitemapurls=CONFIG.crawl.use_sitemap,
max_depth=CONFIG.crawl.max_depth)
max_depth=CONFIG.crawl.max_depth,
skip_domains=skip_domains
)
htmlparser = HTMLBodyParser()

return Scraper(
Expand Down
3 changes: 2 additions & 1 deletion src/util/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .setup import setup
from .setup import setup
from .urls import normalize_url
Loading