diff --git a/linkedin2username.py b/linkedin2username.py index 4eaf507..55cf1d0 100755 --- a/linkedin2username.py +++ b/linkedin2username.py @@ -149,8 +149,8 @@ def split_name(name): Some people have funny names. We assume the most important names are: first name, last name, and the name right before the last name (if they have one) """ - # Split on spaces and dashes (included repeated) - parsed = re.split(r'[\s-]+', name) + # Split on whitespace only; hyphens are part of compound names (e.g. "Jean-Charles") + parsed = re.split(r'\s+', name) # Iterate and remove empty strings parsed = [part for part in parsed if part] @@ -170,61 +170,71 @@ def split_name(name): return split_name + @staticmethod + def _hyphen_variants(name_part): + """Return the full part plus each sub-part if hyphenated. + + Handles cases like 'davidson-smith' -> ['davidson-smith', 'davidson', 'smith'] + so callers can generate usernames for both the compound form and each component. + """ + if '-' in name_part: + return [name_part] + name_part.split('-') + return [name_part] + def f_last(self): """jsmith""" names = set() - names.add(self.name['first'][0] + self.name['last']) - + for last in self._hyphen_variants(self.name['last']): + names.add(self.name['first'][0] + last) if self.name['second']: - names.add(self.name['first'][0] + self.name['second']) - + for second in self._hyphen_variants(self.name['second']): + names.add(self.name['first'][0] + second) return names def f_dot_last(self): """j.smith""" names = set() - names.add(self.name['first'][0] + '.' + self.name['last']) - + for last in self._hyphen_variants(self.name['last']): + names.add(self.name['first'][0] + '.' + last) if self.name['second']: - names.add(self.name['first'][0] + '.' + self.name['second']) - + for second in self._hyphen_variants(self.name['second']): + names.add(self.name['first'][0] + '.' + second) return names def last_f(self): """smithj""" names = set() - names.add(self.name['last'] + self.name['first'][0]) - + for last in self._hyphen_variants(self.name['last']): + names.add(last + self.name['first'][0]) if self.name['second']: - names.add(self.name['second'] + self.name['first'][0]) - + for second in self._hyphen_variants(self.name['second']): + names.add(second + self.name['first'][0]) return names def first_dot_last(self): """john.smith""" names = set() - names.add(self.name['first'] + '.' + self.name['last']) - + for last in self._hyphen_variants(self.name['last']): + names.add(self.name['first'] + '.' + last) if self.name['second']: - names.add(self.name['first'] + '.' + self.name['second']) - + for second in self._hyphen_variants(self.name['second']): + names.add(self.name['first'] + '.' + second) return names def first_l(self): """johns""" names = set() - names.add(self.name['first'] + self.name['last'][0]) - + for last in self._hyphen_variants(self.name['last']): + names.add(self.name['first'] + last[0]) if self.name['second']: - names.add(self.name['first'] + self.name['second'][0]) - + for second in self._hyphen_variants(self.name['second']): + names.add(self.name['first'] + second[0]) return names def first(self): """john""" names = set() names.add(self.name['first']) - return names diff --git a/pyproject.toml b/pyproject.toml index c94e621..32fe3fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "linkedin2username" -version = "0.29" +version = "0.30" requires-python = ">=3.10" dependencies = [ "requests>=2.34.2", diff --git a/tests/test_linkedin2username.py b/tests/test_linkedin2username.py index 0312d16..93c07c7 100644 --- a/tests/test_linkedin2username.py +++ b/tests/test_linkedin2username.py @@ -8,140 +8,206 @@ 2: "John Davidson-Smith", 3: "John-Paul Smith-Robinson", 4: "José Gonzáles", - 5: "🙂 Emoji Folks 🙂" + 5: "🙂 Emoji Folks 🙂", + 6: "Jean-Charles Martin", + 7: "Madonna Wayne Gacey", # 3-word name: exercises the 'second' field in mutations } def test_f_last(): name = TEST_NAMES[1] mutator = NameMutator(name) - assert mutator.f_last() == set(["jsmith", ]) + assert mutator.f_last() == set(["jsmith"]) + # Hyphenated last: compound form + each part name = TEST_NAMES[2] mutator = NameMutator(name) - assert mutator.f_last() == set(["jsmith", "jdavidson"]) + assert mutator.f_last() == set(["jdavidson-smith", "jdavidson", "jsmith"]) + # Hyphenated first and last: compound last + each last part; first stays compound name = TEST_NAMES[3] mutator = NameMutator(name) - assert mutator.f_last() == set(["jsmith", "jrobinson"]) + assert mutator.f_last() == set(["jsmith-robinson", "jsmith", "jrobinson"]) name = TEST_NAMES[4] mutator = NameMutator(name) - assert mutator.f_last() == set(["jgonzales", ]) + assert mutator.f_last() == set(["jgonzales"]) name = TEST_NAMES[5] mutator = NameMutator(name) - assert mutator.f_last() == set(["efolks", ]) + assert mutator.f_last() == set(["efolks"]) + + # Compound hyphenated first name: jean-charles.martin must be generated (issue #82) + name = TEST_NAMES[6] + mutator = NameMutator(name) + assert mutator.f_last() == set(["jmartin"]) + + # 3-word name: second field produces an extra variant using the middle name + name = TEST_NAMES[7] + mutator = NameMutator(name) + assert mutator.f_last() == set(["mgacey", "mwayne"]) def test_f_dot_last(): name = TEST_NAMES[1] mutator = NameMutator(name) - assert mutator.f_dot_last() == set(["j.smith", ]) + assert mutator.f_dot_last() == set(["j.smith"]) name = TEST_NAMES[2] mutator = NameMutator(name) - assert mutator.f_dot_last() == set(["j.smith", "j.davidson"]) + assert mutator.f_dot_last() == set(["j.davidson-smith", "j.davidson", "j.smith"]) name = TEST_NAMES[3] mutator = NameMutator(name) - assert mutator.f_dot_last() == set(["j.smith", "j.robinson"]) + assert mutator.f_dot_last() == set(["j.smith-robinson", "j.smith", "j.robinson"]) name = TEST_NAMES[4] mutator = NameMutator(name) - assert mutator.f_dot_last() == set(["j.gonzales", ]) + assert mutator.f_dot_last() == set(["j.gonzales"]) name = TEST_NAMES[5] mutator = NameMutator(name) - assert mutator.f_dot_last() == set(["e.folks", ]) + assert mutator.f_dot_last() == set(["e.folks"]) + + name = TEST_NAMES[6] + mutator = NameMutator(name) + assert mutator.f_dot_last() == set(["j.martin"]) + + name = TEST_NAMES[7] + mutator = NameMutator(name) + assert mutator.f_dot_last() == set(["m.gacey", "m.wayne"]) def test_last_f(): name = TEST_NAMES[1] mutator = NameMutator(name) - assert mutator.last_f() == set(["smithj", ]) + assert mutator.last_f() == set(["smithj"]) name = TEST_NAMES[2] mutator = NameMutator(name) - assert mutator.last_f() == set(["smithj", "davidsonj"]) + assert mutator.last_f() == set(["davidson-smithj", "davidsonj", "smithj"]) name = TEST_NAMES[3] mutator = NameMutator(name) - assert mutator.last_f() == set(["smithj", "robinsonj"]) + assert mutator.last_f() == set(["smith-robinsonj", "smithj", "robinsonj"]) name = TEST_NAMES[4] mutator = NameMutator(name) - assert mutator.last_f() == set(["gonzalesj", ]) + assert mutator.last_f() == set(["gonzalesj"]) name = TEST_NAMES[5] mutator = NameMutator(name) - assert mutator.last_f() == set(["folkse", ]) + assert mutator.last_f() == set(["folkse"]) + + name = TEST_NAMES[6] + mutator = NameMutator(name) + assert mutator.last_f() == set(["martinj"]) + + name = TEST_NAMES[7] + mutator = NameMutator(name) + assert mutator.last_f() == set(["gaceym", "waynem"]) def test_first_dot_last(): name = TEST_NAMES[1] mutator = NameMutator(name) - assert mutator.first_dot_last() == set(["john.smith", ]) + assert mutator.first_dot_last() == set(["john.smith"]) name = TEST_NAMES[2] mutator = NameMutator(name) - assert mutator.first_dot_last() == set(["john.smith", "john.davidson"]) + assert mutator.first_dot_last() == set(["john.davidson-smith", "john.davidson", "john.smith"]) + # Compound first name is preserved intact; last name variants are expanded name = TEST_NAMES[3] mutator = NameMutator(name) - assert mutator.first_dot_last() == set(["john.smith", "john.robinson"]) + assert mutator.first_dot_last() == set(["john-paul.smith-robinson", "john-paul.smith", "john-paul.robinson"]) name = TEST_NAMES[4] mutator = NameMutator(name) - assert mutator.first_dot_last() == set(["jose.gonzales", ]) + assert mutator.first_dot_last() == set(["jose.gonzales"]) name = TEST_NAMES[5] mutator = NameMutator(name) - assert mutator.first_dot_last() == set(["emoji.folks", ]) + assert mutator.first_dot_last() == set(["emoji.folks"]) + + # The core fix for issue #82: compound first name generates the correct username + name = TEST_NAMES[6] + mutator = NameMutator(name) + assert mutator.first_dot_last() == set(["jean-charles.martin"]) + + name = TEST_NAMES[7] + mutator = NameMutator(name) + assert mutator.first_dot_last() == set(["madonna.gacey", "madonna.wayne"]) def test_first_l(): name = TEST_NAMES[1] mutator = NameMutator(name) - assert mutator.first_l() == set(["johns", ]) + assert mutator.first_l() == set(["johns"]) + # davidson-smith[0]='d', davidson[0]='d' (dup), smith[0]='s' name = TEST_NAMES[2] mutator = NameMutator(name) - assert mutator.first_l() == set(["johns", "johnd"]) + assert mutator.first_l() == set(["johnd", "johns"]) + # smith-robinson[0]='s', smith[0]='s' (dup), robinson[0]='r' name = TEST_NAMES[3] mutator = NameMutator(name) - assert mutator.first_l() == set(["johns", "johnr"]) + assert mutator.first_l() == set(["john-pauls", "john-paulr"]) name = TEST_NAMES[4] mutator = NameMutator(name) - assert mutator.first_l() == set(["joseg", ]) + assert mutator.first_l() == set(["joseg"]) name = TEST_NAMES[5] mutator = NameMutator(name) - assert mutator.first_l() == set(["emojif", ]) + assert mutator.first_l() == set(["emojif"]) + + name = TEST_NAMES[6] + mutator = NameMutator(name) + assert mutator.first_l() == set(["jean-charlesm"]) + + name = TEST_NAMES[7] + mutator = NameMutator(name) + assert mutator.first_l() == set(["madonnag", "madonnaw"]) def test_first(): name = TEST_NAMES[1] mutator = NameMutator(name) - assert mutator.first() == set(["john", ]) + assert mutator.first() == set(["john"]) name = TEST_NAMES[2] mutator = NameMutator(name) - assert mutator.first() == set(["john", ]) + assert mutator.first() == set(["john"]) + # Compound first name is preserved intact name = TEST_NAMES[3] mutator = NameMutator(name) - assert mutator.first() == set(["john", ]) + assert mutator.first() == set(["john-paul"]) name = TEST_NAMES[4] mutator = NameMutator(name) - assert mutator.first() == set(["jose", ]) + assert mutator.first() == set(["jose"]) name = TEST_NAMES[5] mutator = NameMutator(name) - assert mutator.first() == set(["emoji", ]) + assert mutator.first() == set(["emoji"]) + + name = TEST_NAMES[6] + mutator = NameMutator(name) + assert mutator.first() == set(["jean-charles"]) + + # 3-word name: first() always returns only the first token regardless of second/last + name = TEST_NAMES[7] + mutator = NameMutator(name) + assert mutator.first() == set(["madonna"]) + + +def test_hyphen_variants(): + assert NameMutator._hyphen_variants("smith") == ["smith"] + assert NameMutator._hyphen_variants("davidson-smith") == ["davidson-smith", "davidson", "smith"] + assert NameMutator._hyphen_variants("a-b-c") == ["a-b-c", "a", "b", "c"] def test_clean_name(): @@ -170,6 +236,16 @@ def test_split_name(): name = "brian warner is marilyn manson" assert mutator.split_name(name) == {"first": "brian", "second": "marilyn", "last": "manson"} + # Hyphens within a name segment are preserved (not treated as word separators) + name = "jean-charles martin" + assert mutator.split_name(name) == {"first": "jean-charles", "second": "", "last": "martin"} + + name = "john davidson-smith" + assert mutator.split_name(name) == {"first": "john", "second": "", "last": "davidson-smith"} + + name = "john-paul smith-robinson" + assert mutator.split_name(name) == {"first": "john-paul", "second": "", "last": "smith-robinson"} + def test_find_employees(): with open("tests/mock-employee-response", "r") as infile: