From 03bf6fd5faec78517c572e05c187aef8a1074c2a Mon Sep 17 00:00:00 2001 From: Paras Sharma <130871046+Paras20222@users.noreply.github.com> Date: Thu, 2 Jan 2025 21:32:38 +0530 Subject: [PATCH] Added scraper, documentation, and sample data --- config.json | 17 ++ data/editorials/1352A.txt | 1 + data/problems/1352A.txt | 37 ++++ docs/README.md | 31 +++ main.ipynb | 402 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 4 + 6 files changed, 492 insertions(+) create mode 100644 config.json create mode 100644 data/editorials/1352A.txt create mode 100644 data/problems/1352A.txt create mode 100644 docs/README.md create mode 100644 main.ipynb create mode 100644 requirements.txt diff --git a/config.json b/config.json new file mode 100644 index 00000000..d7fd6ffc --- /dev/null +++ b/config.json @@ -0,0 +1,17 @@ +{ + "rate_limit_delay": 2, + "max_problems": 10, + "paths": { + "root": "C:\\Users\\DELL\\Desktop\\codeforces-scraper", + "data": "data", + "problems": "data/problems", + "editorials": "data/editorials", + "metadata": "data/metadata", + "docs": "docs", + "samples": "data/samples" + }, + "selenium": { + "headless": true, + "timeout": 10 + } +} \ No newline at end of file diff --git a/data/editorials/1352A.txt b/data/editorials/1352A.txt new file mode 100644 index 00000000..5ad2434b --- /dev/null +++ b/data/editorials/1352A.txt @@ -0,0 +1 @@ +1352A - Sum of Round Numbers Tutorial1352A - Sum of Round NumbersFirstly, we need to understand the minimum amount of round numbers we need to represent 𝑛. It equals the number of non-zero digits in 𝑛. Why? 
Because we can "remove" exactly one non-zero digit in 𝑛 using exactly one round number (so we need at most this amount of round numbers) and, on the other hand, the sum of two round numbers has at most two non-zero digits (the sum of three round numbers has at most three non-zero digits and so on) so this is useless to try to remove more than one digit using the sum of several round numbers. So we need to find all digits of 𝑛 and print the required number for each of these digits. For example, if 𝑛=103 then 𝑛=1⋅10^2+0⋅10^1+3⋅10^0, so we need two round numbers: 1⋅10^2 and 3⋅10^0. Because the last digit of 𝑛 is 𝑛%10 (the remainder of 𝑛 modulo 10) and we can remove the last digit of the number by integer division on 10, we can use the following code to solve the problem: int n; cin >> n; vector<int> ans; int power = 1; while (n > 0) { if (n % 10 > 0) { ans.push_back((n % 10) * power); } n /= 10; power *= 10; } cout << ans.size() << endl; for (auto number : ans) cout << number << " "; cout << endl; Solution#include diff --git a/data/problems/1352A.txt b/data/problems/1352A.txt new file mode 100644 index 00000000..a7a0fec5 --- /dev/null +++ b/data/problems/1352A.txt @@ -0,0 +1,37 @@ +A positive (strictly greater than zero) integer is called round if it is of the form d00...0. In other words, a positive integer is round if all its digits except the leftmost (most significant) are equal to zero. In particular, all numbers from 1 + to 9 + (inclusive) are round. + +For example, the following numbers are round: 4000 +, 1 +, 9 +, 800 +, 90 +. The following numbers are not round: 110 +, 707 +, 222 +, 1001 +. + +You are given a positive integer n + (1≤n≤10^4 +). Represent the number n + as a sum of round numbers using the minimum number of summands (addends). In other words, you need to represent the given number n + as a sum of the least number of terms, each of which is a round number. + +Input +The first line contains an integer t + (1≤t≤10^4 +) — the number of test cases in the input. 
Then t + test cases follow. + +Each test case is a line containing an integer n + (1≤n≤10^4 +). + +Output +Print t + answers to the test cases. Each answer must begin with an integer k + — the minimum number of summands. Next, k + terms must follow, each of which is a round number, and their sum is n +. The terms can be printed in any order. If there are several answers, print any of them. \ No newline at end of file diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..33f38d11 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,31 @@ +# Codeforces Problem Scraper + +## Overview +This tool scrapes problem statements and editorials from Codeforces, preserving LaTeX formatting and code blocks. + +## Features +- Extracts problem statements with preserved LaTeX formatting +- Captures test cases and sample inputs/outputs +- Preserves code blocks with proper formatting +- Stores metadata in JSON format +- Includes editorial content with proper section handling + +## Project Structure +``` +project/ +├── data/ +│ ├── problems/ +│ ├── editorials/ +│ ├── metadata/ +│ └── samples/ +└── docs/ +``` + +## Usage +Open `main.ipynb` in Jupyter (or Google Colab) and run all cells to start the scraper: +```bash +jupyter notebook main.ipynb +``` + +## Configuration +Settings can be modified in `config.json` diff --git a/main.ipynb b/main.ipynb new file mode 100644 index 00000000..ffdf4da5 --- /dev/null +++ b/main.ipynb @@ -0,0 +1,402 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KJe0CtM7BnGu", + "outputId": "e447045e-1f12-4571-f236-08bb42235e2c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting selenium\n", + " Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)\n", + "Requirement already satisfied: urllib3<3,>=1.26 in /usr/local/lib/python3.10/dist-packages (from urllib3[socks]<3,>=1.26->selenium) (2.2.3)\n", + "Collecting trio~=0.17 (from selenium)\n", + " 
Downloading trio-0.28.0-py3-none-any.whl.metadata (8.5 kB)\n", + "Collecting trio-websocket~=0.9 (from selenium)\n", + " Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)\n", + "Requirement already satisfied: certifi>=2021.10.8 in /usr/local/lib/python3.10/dist-packages (from selenium) (2024.12.14)\n", + "Requirement already satisfied: typing_extensions~=4.9 in /usr/local/lib/python3.10/dist-packages (from selenium) (4.12.2)\n", + "Requirement already satisfied: websocket-client~=1.8 in /usr/local/lib/python3.10/dist-packages (from selenium) (1.8.0)\n", + "Requirement already satisfied: attrs>=23.2.0 in /usr/local/lib/python3.10/dist-packages (from trio~=0.17->selenium) (24.3.0)\n", + "Collecting sortedcontainers (from trio~=0.17->selenium)\n", + " Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)\n", + "Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from trio~=0.17->selenium) (3.10)\n", + "Collecting outcome (from trio~=0.17->selenium)\n", + " Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)\n", + "Requirement already satisfied: sniffio>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from trio~=0.17->selenium) (1.3.1)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from trio~=0.17->selenium) (1.2.2)\n", + "Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)\n", + " Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)\n", + "Requirement already satisfied: pysocks!=1.5.7,<2.0,>=1.5.6 in /usr/local/lib/python3.10/dist-packages (from urllib3[socks]<3,>=1.26->selenium) (1.7.1)\n", + "Requirement already satisfied: h11<1,>=0.9.0 in /usr/local/lib/python3.10/dist-packages (from wsproto>=0.14->trio-websocket~=0.9->selenium) (0.14.0)\n", + "Downloading selenium-4.27.1-py3-none-any.whl (9.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.7/9.7 MB\u001b[0m 
\u001b[31m75.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading trio-0.28.0-py3-none-any.whl (486 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m486.3/486.3 kB\u001b[0m \u001b[31m30.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)\n", + "Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)\n", + "Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)\n", + "Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)\n", + "Installing collected packages: sortedcontainers, wsproto, outcome, trio, trio-websocket, selenium\n", + "Successfully installed outcome-1.3.0.post0 selenium-4.27.1 sortedcontainers-2.4.0 trio-0.28.0 trio-websocket-0.11.1 wsproto-1.2.0\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import time\n", + "import requests\n", + "import re\n", + "from bs4 import BeautifulSoup\n", + "!pip install selenium\n", + "from selenium import webdriver\n", + "from selenium.webdriver.chrome.options import Options\n", + "from selenium.webdriver.chrome.service import Service\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "from selenium.common.exceptions import TimeoutException\n", + "import shutil\n", + "\n", + "class CodeforcesScraper:\n", + " def __init__(self):\n", + " self.base_url = \"https://codeforces.com\"\n", + " self.setup_project_structure()\n", + " self.setup_selenium()\n", + "\n", + " def setup_project_structure(self):\n", + " # Define project structure\n", + " self.project_dirs = {\n", + " 'root': os.getcwd(),\n", + " 'data': 'data',\n", + " 'problems': 'data/problems',\n", + " 'editorials': 'data/editorials',\n", + " 'metadata': 'data/metadata',\n", + " 'docs': 'docs',\n", + " 'samples': 'data/samples'\n", + " }\n", + "\n", + " # Create 
directories\n", + " for dir_path in self.project_dirs.values():\n", + " os.makedirs(dir_path, exist_ok=True)\n", + "\n", + " # Create documentation\n", + " self.create_documentation()\n", + "\n", + " # Create config file\n", + " self.create_config()\n", + "\n", + " def create_documentation(self):\n", + " readme_content = \"\"\"# Codeforces Problem Scraper\n", + "\n", + "## Overview\n", + "This tool scrapes problem statements and editorials from Codeforces, preserving LaTeX formatting and code blocks.\n", + "\n", + "## Features\n", + "- Extracts problem statements with preserved LaTeX formatting\n", + "- Captures test cases and sample inputs/outputs\n", + "- Preserves code blocks with proper formatting\n", + "- Stores metadata in JSON format\n", + "- Includes editorial content with proper section handling\n", + "\n", + "## Project Structure\n", + "```\n", + "project/\n", + "├── data/\n", + "│ ├── problems/\n", + "│ ├── editorials/\n", + "│ ├── metadata/\n", + "│ └── samples/\n", + "└── docs/\n", + "```\n", + "\n", + "## Usage\n", + "Open `main.ipynb` in Jupyter (or Google Colab) and run all cells to start the scraper:\n", + "```bash\n", + "jupyter notebook main.ipynb\n", + "```\n", + "\n", + "## Configuration\n", + "Settings can be modified in `config.json`\n", + "\"\"\"\n", + " with open(os.path.join(self.project_dirs['docs'], 'README.md'), 'w', encoding='utf-8') as f:\n", + " f.write(readme_content)\n", + "\n", + " def create_config(self):\n", + " config = {\n", + " 'rate_limit_delay': 2,\n", + " 'max_problems': 10,\n", + " 'paths': self.project_dirs,\n", + " 'selenium': {\n", + " 'headless': True,\n", + " 'timeout': 10\n", + " }\n", + " }\n", + " with open('config.json', 'w', encoding='utf-8') as f:\n", + " json.dump(config, f, indent=4)\n", + "\n", + " def setup_selenium(self):\n", + " chrome_options = Options()\n", + " chrome_options.add_argument(\"--headless\")\n", + " chrome_options.add_argument(\"--disable-gpu\")\n", + " chrome_options.add_argument(\"--no-sandbox\")\n", + " 
chrome_options.add_argument(\"--disable-dev-shm-usage\")\n", + " chrome_options.add_argument(\"--log-level=3\")\n", + " self.driver = webdriver.Chrome(options=chrome_options)\n", + "\n", + " def extract_latex(self, element):\n", + " \"\"\"Preserve LaTeX formatting in text.\"\"\"\n", + " if not element:\n", + " return \"\"\n", + "\n", + " # Convert element to string while preserving LaTeX\n", + " content = str(element)\n", + "\n", + " # Preserve inline LaTeX\n", + " content = re.sub(r'\\$([^$]+)\\$', r'$\\1$', content)\n", + "\n", + " # Preserve block LaTeX\n", + " content = re.sub(r'\\$\\$([^$]+)\\$\\$', r'$$\\1$$', content)\n", + "\n", + " # Convert to plain text while keeping LaTeX\n", + " soup = BeautifulSoup(content, 'html.parser')\n", + " return soup.get_text(separator='\\n')\n", + "\n", + " def extract_code_blocks(self, element):\n", + " \"\"\"Extract and format code blocks.\"\"\"\n", + " code_blocks = []\n", + " if element:\n", + " for pre in element.find_all('pre'):\n", + " code_blocks.append(pre.get_text())\n", + " return code_blocks\n", + "\n", + " def get_test_cases(self, problem_statement):\n", + " \"\"\"Extract test cases from problem statement.\"\"\"\n", + " test_cases = []\n", + " if problem_statement:\n", + " input_specs = problem_statement.find_all('div', class_='input')\n", + " output_specs = problem_statement.find_all('div', class_='output')\n", + "\n", + " for input_spec, output_spec in zip(input_specs, output_specs):\n", + " test_case = {\n", + " 'input': input_spec.find('pre').get_text() if input_spec.find('pre') else \"\",\n", + " 'output': output_spec.find('pre').get_text() if output_spec.find('pre') else \"\"\n", + " }\n", + " test_cases.append(test_case)\n", + "\n", + " return test_cases\n", + "\n", + " def scrape_problem(self, problem_url):\n", + " self.driver.get(problem_url)\n", + " try:\n", + " WebDriverWait(self.driver, 10).until(\n", + " EC.presence_of_element_located((By.CLASS_NAME, \"problem-statement\"))\n", + " )\n", + " except 
TimeoutException:\n", + " return None\n", + "\n", + " soup = BeautifulSoup(self.driver.page_source, 'html.parser')\n", + " problem_statement = soup.find(\"div\", class_=\"problem-statement\")\n", + "\n", + " if not problem_statement:\n", + " return None\n", + "\n", + " # Extract problem components\n", + " header = problem_statement.find('div', class_='header')\n", + " title = header.find('div', class_='title').text.strip() if header else \"\"\n", + " time_limit = header.find('div', class_='time-limit').text.strip() if header else \"\"\n", + " memory_limit = header.find('div', class_='memory-limit').text.strip() if header else \"\"\n", + "\n", + " # Get main content with preserved LaTeX (guarded: a missing header yields empty content instead of AttributeError)\n", + " content_div = header.find_next_sibling('div') if header else None\n", + " content = self.extract_latex(content_div)\n", + "\n", + " # Get code blocks\n", + " code_blocks = self.extract_code_blocks(problem_statement)\n", + "\n", + " # Get test cases\n", + " test_cases = self.get_test_cases(problem_statement)\n", + "\n", + " # Get tags\n", + " tags_div = soup.find('div', class_='tag-box')\n", + " tags = [tag.text.strip() for tag in tags_div.find_all('span', class_='tag')] if tags_div else []\n", + "\n", + " return {\n", + " 'title': title,\n", + " 'time_limit': time_limit,\n", + " 'memory_limit': memory_limit,\n", + " 'content': content,\n", + " 'code_blocks': code_blocks,\n", + " 'test_cases': test_cases,\n", + " 'tags': tags,\n", + " 'url': problem_url\n", + " }\n", + "\n", + " def scrape_editorial(self, problem_url):\n", + " editorial_url = problem_url.replace(\"/problem/\", \"/tutorial/\")\n", + " try:\n", + " self.driver.get(editorial_url)\n", + " WebDriverWait(self.driver, 10).until(\n", + " EC.presence_of_element_located((By.ID, \"pageContent\"))\n", + " )\n", + "\n", + " soup = BeautifulSoup(self.driver.page_source, 'html.parser')\n", + " tutorial = soup.find('div', class_='ttypography')\n", + "\n", + " if not tutorial:\n", + " return None\n", + 
"\n", + " # Extract content with preserved LaTeX\n", + " content = self.extract_latex(tutorial)\n", + "\n", + " # Extract code blocks\n", + " code_blocks = self.extract_code_blocks(tutorial)\n", + "\n", + " # Extract section headers\n", + " sections = []\n", + " for header in tutorial.find_all(['h1', 'h2', 'h3']):\n", + " sections.append({\n", + " 'level': int(header.name[1]),\n", + " 'title': header.text.strip()\n", + " })\n", + "\n", + " return {\n", + " 'content': content,\n", + " 'code_blocks': code_blocks,\n", + " 'sections': sections,\n", + " 'url': editorial_url\n", + " }\n", + " except Exception: # best-effort: a missing editorial is not fatal, but Ctrl-C/SystemExit must still propagate\n", + " return None\n", + "\n", + " def save_problem(self, problem_data, problem_number):\n", + " # Save problem statement\n", + " problem_file = os.path.join(self.project_dirs['problems'], f'problem_{problem_number}.txt')\n", + " with open(problem_file, 'w', encoding='utf-8') as f:\n", + " f.write(f\"Title: {problem_data['title']}\\n\")\n", + " f.write(f\"Time Limit: {problem_data['time_limit']}\\n\")\n", + " f.write(f\"Memory Limit: {problem_data['memory_limit']}\\n\\n\")\n", + " f.write(\"Problem Statement:\\n\")\n", + " f.write(problem_data['content'])\n", + "\n", + " if problem_data['code_blocks']:\n", + " f.write(\"\\n\\nCode Blocks:\\n\")\n", + " for i, block in enumerate(problem_data['code_blocks'], 1):\n", + " f.write(f\"\\nBlock {i}:\\n{block}\\n\")\n", + "\n", + " # Save test cases\n", + " if problem_data['test_cases']:\n", + " test_case_file = os.path.join(self.project_dirs['samples'], f'test_cases_{problem_number}.json')\n", + " with open(test_case_file, 'w', encoding='utf-8') as f:\n", + " json.dump(problem_data['test_cases'], f, indent=2)\n", + "\n", + " # Save metadata\n", + " metadata_file = os.path.join(self.project_dirs['metadata'], f'problem_{problem_number}_metadata.json')\n", + " metadata = {\n", + " 'title': problem_data['title'],\n", + " 'tags': problem_data['tags'],\n", + " 'url': problem_data['url'],\n", + " 'time_limit': 
problem_data['time_limit'],\n", + " 'memory_limit': problem_data['memory_limit']\n", + " }\n", + " with open(metadata_file, 'w', encoding='utf-8') as f:\n", + " json.dump(metadata, f, indent=2)\n", + "\n", + " def save_editorial(self, editorial_data, problem_number):\n", + " if editorial_data:\n", + " editorial_file = os.path.join(self.project_dirs['editorials'], f'editorial_{problem_number}.txt')\n", + " with open(editorial_file, 'w', encoding='utf-8') as f:\n", + " # Write sections\n", + " if editorial_data['sections']:\n", + " f.write(\"Sections:\\n\")\n", + " for section in editorial_data['sections']:\n", + " f.write(f\"{'#' * section['level']} {section['title']}\\n\")\n", + " f.write(\"\\n\")\n", + "\n", + " # Write main content\n", + " f.write(\"Content:\\n\")\n", + " f.write(editorial_data['content'])\n", + "\n", + " # Write code blocks\n", + " if editorial_data['code_blocks']:\n", + " f.write(\"\\n\\nCode Blocks:\\n\")\n", + " for i, block in enumerate(editorial_data['code_blocks'], 1):\n", + " f.write(f\"\\nBlock {i}:\\n{block}\\n\")\n", + "\n", + " def get_problem_list(self, num_problems=10):\n", + " url = f\"{self.base_url}/problemset\"\n", + " response = requests.get(url, timeout=10) # bound the wait so a stalled connection cannot hang the run\n", + " soup = BeautifulSoup(response.text, 'html.parser')\n", + "\n", + " problems = []\n", + " problem_rows = soup.select(\"table.problems tr\")[1:num_problems+1]\n", + "\n", + " for row in problem_rows:\n", + " problem_cells = row.select(\"td\")\n", + " if len(problem_cells) >= 2:\n", + " problem_link = problem_cells[1].find(\"a\")\n", + " if problem_link:\n", + " problem_url = problem_link.get(\"href\")\n", + " if problem_url:\n", + " problems.append(self.base_url + problem_url)\n", + "\n", + " return problems\n", + "\n", + " def run(self, num_problems=10):\n", + " with open('config.json', 'r') as f:\n", + " config = json.load(f)\n", + "\n", + " problems = self.get_problem_list(num_problems)\n", + "\n", + " for i, problem_url in enumerate(problems, 1):\n", + " print(f\"Scraping 
problem {i}/{len(problems)}: {problem_url}\")\n", + "\n", + " problem_data = self.scrape_problem(problem_url)\n", + " if problem_data:\n", + " self.save_problem(problem_data, i)\n", + "\n", + " editorial_data = self.scrape_editorial(problem_url)\n", + " if editorial_data:\n", + " self.save_editorial(editorial_data, i)\n", + "\n", + " time.sleep(config['rate_limit_delay'])\n", + "\n", + " def cleanup(self):\n", + " self.driver.quit()\n", + "\n", + "def main():\n", + " scraper = CodeforcesScraper()\n", + " try:\n", + " scraper.run(num_problems=10)\n", + " finally:\n", + " scraper.cleanup()\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..0a729349 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +beautifulsoup4==4.12.2 +selenium==4.16.0 +requests==2.31.0 +webdriver_manager==4.0.1 \ No newline at end of file