diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..23a9951 --- /dev/null +++ b/.gitignore @@ -0,0 +1,38 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +ENV/ +env/ +.venv + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db diff --git a/README.md b/README.md index eaa6abf..be930b4 100644 --- a/README.md +++ b/README.md @@ -1 +1,111 @@ -https://app.chime.com/link/qr?u=Darlie-Withrow \ No newline at end of file +# Actor Data Parser + +This repository contains a Python script to parse actor data from a formatted string. + +## Problem Statement + +Parse actor information from the following format: +``` +actor:Daractor:Darliewithrowliewithrowactor:Darliewithrowactor:Darliewithrow +``` + +## Solution + +The `actor_parser.py` script parses the input string by splitting on the `actor:` delimiter, then intelligently filters out corrupted and fragmented actor names to extract only valid actor names. + +### Usage + +Run with default data: +```bash +python3 actor_parser.py +``` + +Run with data from a file: +```bash +python3 actor_parser.py actor_data.txt +``` + +### Output + +``` +Parsed Actors: +1. Darliewithrow + +Total unique actors: 1 +``` + +## Testing + +Run the test suite: + +```bash +python3 test_actor_parser.py +``` + +## Implementation Details + +- The parser splits the input string by `actor:` delimiter +- Identifies and filters out corrupted names with internal repetitions +- Removes fragment names that are prefixes of longer valid names +- Maintains unique actors in order of first appearance +- Returns a list of valid actor names + +## License + +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. 
#!/usr/bin/env python3
"""
Actor Data Parser

Parse actor names out of a string of the form
``actor:<name>actor:<name>...`` and discard names that look corrupted or
truncated.

This listing carries two source files back to back: the parser with its
command-line entry point (actor_parser.py) and its unittest suite
(test_actor_parser.py).
"""

import re
import sys
import unittest
from typing import List, Set


# ---------------------------------------------------------------------------
# actor_parser.py
# ---------------------------------------------------------------------------

def _is_corrupted(name: str, candidates: List[str]) -> bool:
    """Return True if *name* is another candidate plus an echo of part of it.

    Example: "Darliewithrowliewithrow" is "Darliewithrow" followed by
    "liewithrow", and "liewithrow" already occurs inside "Darliewithrow".
    """
    for other in candidates:
        if name == other or not name.startswith(other):
            continue
        suffix = name[len(other):]
        # The leftover text repeating inside the prefix is what marks the
        # name as a garbled copy rather than a genuinely longer name.
        if suffix and suffix in other:
            return True
    return False


def _is_fragment(name: str, candidates: List[str], corrupted: Set[str]) -> bool:
    """Return True if *name* is a short prefix of a non-corrupted candidate.

    "Short" means at most half the length of the longer name, so that two
    names which merely share a long common prefix are both kept.
    """
    return any(
        other != name
        and other not in corrupted
        and other.startswith(name)
        and 2 * len(name) <= len(other)
        for other in candidates
    )


def parse_actors(data: str) -> List[str]:
    """
    Parse actor names from the input string.

    The input is split on the literal ``actor:`` marker; each piece is
    stripped of surrounding whitespace and empty pieces are dropped.
    Duplicates are removed (first occurrence wins), then two heuristics
    are applied:

    1. A name is treated as corrupted when it starts with another parsed
       name and the remaining suffix is a substring of that other name.
    2. A name is treated as a fragment when it is a prefix of a
       non-corrupted name and is at most half as long.

    Args:
        data: Input string in format "actor:<name>actor:<name>..." where
            individual names may be corrupted

    Returns:
        List of valid actor names in order of first appearance, with
        corrupted and fragment names filtered out. An empty or
        whitespace-only input yields an empty list.
    """
    if not data:
        return []

    stripped = (part.strip() for part in data.split('actor:'))
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_actors = list(dict.fromkeys(name for name in stripped if name))

    # Corruption is decided for every name first, because the fragment
    # check below must ignore corrupted names when looking for a longer,
    # valid name.
    corrupted_actors = {
        name for name in unique_actors if _is_corrupted(name, unique_actors)
    }

    return [
        name
        for name in unique_actors
        if name not in corrupted_actors
        and not _is_fragment(name, unique_actors, corrupted_actors)
    ]


def main() -> None:
    """Command-line entry point.

    With a file path as the first argument the input is read from that
    file; otherwise the built-in sample string is used. The input, the
    parsed names and their count are printed to stdout. If the file cannot
    be read, a message is written to stderr and the process exits with
    status 1.
    """
    if len(sys.argv) > 1:
        file_path = sys.argv[1]
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                input_data = f.read().strip()
        except FileNotFoundError:
            print(f"Error: File '{file_path}' not found", file=sys.stderr)
            sys.exit(1)
        except (OSError, UnicodeDecodeError) as exc:
            # Directories, permission problems and undecodable bytes are
            # reported the same way instead of surfacing as a traceback.
            print(f"Error: Could not read '{file_path}': {exc}", file=sys.stderr)
            sys.exit(1)
        print(f"Reading from file: {file_path}")
    else:
        # The problem statement data
        input_data = "actor:Daractor:Darliewithrowliewithrowactor:Darliewithrowactor:Darliewithrow"

    print(f"Input data: {input_data}\n")

    actors = parse_actors(input_data)

    print("Parsed Actors:")
    for i, actor in enumerate(actors, 1):
        print(f"{i}. {actor}")

    print(f"\nTotal unique actors: {len(actors)}")


if __name__ == "__main__":
    main()


# ---------------------------------------------------------------------------
# test_actor_parser.py
# (as a separate file this section imports parse_actors from actor_parser)
# ---------------------------------------------------------------------------

class TestActorParser(unittest.TestCase):
    """Test cases for the actor parser"""

    def test_simple_actors(self):
        """Test parsing simple actor list"""
        data = "actor:John actor:Jane actor:Bob"
        expected = ["John", "Jane", "Bob"]
        self.assertEqual(parse_actors(data), expected)

    def test_duplicate_actors(self):
        """Test that duplicate actors are removed"""
        data = "actor:Alice actor:Bob actor:Alice"
        expected = ["Alice", "Bob"]
        self.assertEqual(parse_actors(data), expected)

    def test_empty_string(self):
        """Test parsing empty string"""
        self.assertEqual(parse_actors(""), [])

    def test_whitespace_only(self):
        """Test that whitespace-only input yields no actors"""
        self.assertEqual(parse_actors("   \n"), [])

    def test_single_actor(self):
        """Test parsing single actor"""
        data = "actor:SingleActor"
        expected = ["SingleActor"]
        self.assertEqual(parse_actors(data), expected)

    def test_problem_statement_data(self):
        """Test the actual problem statement data"""
        data = "actor:Daractor:Darliewithrowliewithrowactor:Darliewithrowactor:Darliewithrow"
        # The fragment "Dar" and the echoed "Darliewithrowliewithrow" must
        # both be dropped, leaving exactly one name.
        self.assertEqual(parse_actors(data), ["Darliewithrow"])

    def test_actors_with_whitespace(self):
        """Test parsing actors with whitespace"""
        data = "actor: SpaceActor actor:NoSpace "
        self.assertEqual(parse_actors(data), ["SpaceActor", "NoSpace"])

    def test_long_shared_prefix_is_kept(self):
        """Test that a prefix longer than half the other name is kept"""
        data = "actor:Darliewith actor:Darliewithrow"
        self.assertEqual(parse_actors(data), ["Darliewith", "Darliewithrow"])

    def test_no_actor_prefix(self):
        """Test string without actor prefix"""
        data = "JustAName"
        expected = ["JustAName"]
        self.assertEqual(parse_actors(data), expected)


if __name__ == "__main__":
    unittest.main()