dhmit · fyang3 · May 18, 2021 · May 21, 2021 · Jun 15, 2021 · Jun 16, 2021
diff --git a/corpus_analysis/character.py b/corpus_analysis/character.py
@@ -1,6 +1,21 @@
 from gender_analysis.pronouns import PronounSeries
 from gender_analysis.gender import Gender
 
+# Honorific titles grouped by the gender they imply; HONORIFICS is the combined list.
+FEMALE_HONORIFICS = ["Miss", "Mrs", "Ms", "Mistress", "Madam", "Ma'am", "Dame",
+                     "Lady", "Her Honour", "Her Honor", "My Lady", "Your Ladyship",
+                     "Sr", "Sister", "Sayyidah"]
+MALE_HONORIFICS = ["Master", "Mr", "Sir", "Gentleman", "Sire", "Lord", "His Honour",
+                   "His Honor", "My Lord", "Your Lordship", "Esquire", "Esq", "His Holiness",
+                   "Pope", "His All Holiness", "His Beatitude", "The Reverend", "Rev", "Fr",
+                   "Father", "Pr", "Pastor", "Br", "Brother", "Rabbi", "Imam", "Mufti", "Sayyid"]
+NEUTRAL_HONORIFICS = ["Mx", "Excellency", "Excellence", "Your Honor", "The Honorable",
+                      "The Honourable", "The Hon", "Hon", "The Hon'ble", "The Right Honourable",
+                      "The Most Honourable", "Dr", "Doctor", "Professor", "QC", "Cl", "S Cl",
+                      "Counsel", "Senior Counsel", "Eur Ing", "Vice-Chancellor", "Principal",
+                      "President", "Warden", "Dean", "Regent", "Rector", "Provost", "Director",
+                      "Chief Executive", "Venerable", "Eminent"]
+HONORIFICS = FEMALE_HONORIFICS + MALE_HONORIFICS + NEUTRAL_HONORIFICS
 
 class Character:
     """

diff --git a/corpus_analysis/document.py b/corpus_analysis/document.py
@@ -9,6 +9,9 @@
 
 from corpus_analysis import common
 
+# for character identification pipeline
+from corpus_analysis.character import HONORIFICS
+
 
 class Document:
     """
@@ -637,3 +640,83 @@ def update_metadata(self, new_metadata):
                         f" not '{new_metadata['date']}'"
                     ) from err
             setattr(self, key, new_metadata[key])
+
+    def get_char_list(self, cutoff_num=10):
+        """
+        Find the character names in this document and how often each one is mentioned.
+        :param cutoff_num: minimum number of mentions a name needs to be kept (default 10)
+        :return: list of (name, count) tuples for names tagged PERSON by NLTK that occur at
+        least cutoff_num times, sorted by descending count
+        >>> from corpus_analysis import document
+        >>> from pathlib import Path
+        >>> from gender_analysis import common
+        >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818',
+        ...                      'filename': 'austen_persuasion.txt',
+        ...                      'filepath': Path(common.TEST_DATA_PATH, 'sample_novels',
+        ...                                       'texts', 'austen_persuasion.txt')}
+        >>> persuasion = document.Document(document_metadata)
+        >>> persuasion_chars = persuasion.get_char_list(20)
+        >>> persuasion_chars
+        [('Anne', 425), ('Captain Wentworth', 119), ('Lady Russell', 116), ('Charles', 115), ('Mary', 113), ('Sir Walter', 95), ('Elizabeth', 82), ('Elliot', 76), ('Louisa', 66), ('Henrietta', 65), ('Mrs Musgrove', 40), ('Mrs Smith', 36), ('Mrs Clay', 33), ('Miss Elliot', 32), ('Captain Benwick', 32), ('Wentworth', 31), ('Charles Hayter', 29), ('Mrs Croft', 27), ('Benwick', 26), ('Musgrove', 24), ('Uppercross', 23), ('Lady Dalrymple', 22), ('Captain Harville', 21)]
+        >>> len(persuasion_chars)
+        23
+        """
+
+        # Tally only the chunks NLTK labels as PERSON. Other entity labels (GPE,
+        # ORGANIZATION, ...) are never returned, so skipping them here also means an
+        # unexpected label can no longer raise a KeyError.
+        people = {}
+        text = self._load_document_text()
+        for sent in nltk.sent_tokenize(text):
+            tagged = nltk.pos_tag(nltk.word_tokenize(sent))
+            for chunk in nltk.ne_chunk(tagged):
+                # Named-entity chunks are subtrees with a label; plain tokens are tuples.
+                if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
+                    name = ' '.join(leaf[0] for leaf in chunk)
+                    people[name] = people.get(name, 0) + 1
+
+        # Drop bare honorifics ("Sir", "Lady", ...) and names below the cutoff.
+        honorifics = set(HONORIFICS)
+        char_list = [
+            (name, count) for name, count in people.items()
+            if count >= cutoff_num and name not in honorifics
+        ]
+        # The sort is stable, so names with equal counts keep first-appearance order.
+        char_list.sort(key=lambda pair: pair[1], reverse=True)
+
+        return char_list
+
+    @staticmethod
+    def filter_honr(name):
+        """Split a name on whitespace and drop honorific tokens ("Mr", "Mr.", "Lady", ...)."""
+        return [n for n in name.split() if n.rstrip('.') not in HONORIFICS]
+
+    def char_name_disambiguation(self, char_list):
+        """Group each character name with the later names that share a word with it.
+        :param char_list: list of (name, count) tuples as returned by get_char_list
+        :return: one list per entry of char_list: the entry itself, followed by every later
+        entry whose name shares at least one non-honorific word with it
+        >>> from corpus_analysis import document
+        >>> from pathlib import Path
+        >>> from gender_analysis import common
+        >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818',
+        ...                      'filename': 'austen_persuasion.txt',
+        ...                      'filepath': Path(common.TEST_DATA_PATH, 'sample_novels',
+        ...                                       'texts', 'austen_persuasion.txt')}
+        >>> persuasion = document.Document(document_metadata)
+        >>> persuasion_chars = persuasion.get_char_list(20)
+        >>> disamb = persuasion.char_name_disambiguation(persuasion_chars)
+        >>> disamb
+        [[('Anne', 425)], [('Captain Wentworth', 119), ('Captain Benwick', 32), ('Wentworth', 31), ('Captain Harville', 21)], [('Lady Russell', 116)], [('Charles', 115), ('Charles Hayter', 29)], [('Mary', 113)], [('Sir Walter', 95)], [('Elizabeth', 82)], [('Elliot', 76), ('Miss Elliot', 32)], [('Louisa', 66)], [('Henrietta', 65)], [('Mrs Musgrove', 40), ('Musgrove', 24)], [('Mrs Smith', 36)], [('Mrs Clay', 33)], [('Miss Elliot', 32)], [('Captain Benwick', 32), ('Benwick', 26), ('Captain Harville', 21)], [('Wentworth', 31)], [('Charles Hayter', 29)], [('Mrs Croft', 27)], [('Benwick', 26)], [('Musgrove', 24)], [('Uppercross', 23)], [('Lady Dalrymple', 22)], [('Captain Harville', 21)]]
+        >>> len(disamb)
+        23
+        """
+        # Tokenise each name once instead of once per pairwise comparison.
+        tokens = [set(self.filter_honr(entry[0])) for entry in char_list]
+        to_return = []
+        # Every entry heads its own cluster, including the last one.
+        for i, entry in enumerate(char_list):
+            later = range(i + 1, len(char_list))
+            # Two token sets intersect when the names share a non-honorific word.
+            to_return.append([entry] + [char_list[j] for j in later if tokens[i] & tokens[j]])
+        return to_return
diff --git a/gender_analysis/analysis/__init__.py b/gender_analysis/analysis/__init__.py
@@ -3,11 +3,9 @@
     'dunning',
     'gender_frequency',
     'instance_distance',
-    'proximity',
 ]
 
 from gender_analysis.analysis.dependency_parsing import *
 from gender_analysis.analysis.dunning import *
 from gender_analysis.analysis.gender_frequency import *
 from gender_analysis.analysis.instance_distance import *
-from gender_analysis.analysis.proximity import *
diff --git a/gender_analysis/analysis/proximity.py b/gender_analysis/analysis/proximity.py
@@ -4,7 +4,7 @@
 from more_itertools import windowed
 import nltk
 
-from corpus_analysis.corpus import Corpus
+#from corpus_analysis.corpus import Corpus
 from corpus_analysis.document import Document
 from corpus_analysis.common import load_pickle, store_pickle, NLTK_TAGS, NLTK_TAGS_ADJECTIVES
 
@@ -79,7 +79,7 @@ def _diff_gender_token_counters(gender_token_counters: GenderTokenCounters,
         return difference_dict
 
 
-def _generate_token_counter(document: Document,
+def _generate_token_counter(document,
                             gender_to_find: Gender,
                             word_window: int,
                             tags: Sequence[str],
@@ -139,7 +139,7 @@ def _generate_token_counter(document: Document,
     return output
 
 
-def _generate_gender_token_counters(document: Document,
+def _generate_gender_token_counters(document,
                                     genders: Sequence[Gender],
                                     tags: Sequence[str],
                                     word_window: int) -> GenderTokenCounters:
@@ -278,7 +278,7 @@ def _sort_token_counter(token_counter: Counter,
     return output_token_counter.most_common(limit)
 
 
-def find_in_document_gender(document: Document,
+def find_in_document_gender(document,
                             gender: Gender,
                             tags: Sequence[str] = None,
                             word_window: int = 5,
@@ -313,7 +313,7 @@ def find_in_document_gender(document: Document,
                                    genders_to_exclude=genders_to_exclude)
 
 
-def find_in_document_female(document: Document,
+def find_in_document_female(document,
                             tags: Sequence[str] = None,
                             word_window: int = 5) -> Counter:
     """
@@ -391,7 +391,7 @@ class GenderProximityAnalyzer:
     """
 
     def __init__(self,
-                 texts: Union[Document, Corpus, Sequence[Document]],
+                 texts,
                  tags: Optional[Sequence[str]] = None,
                  genders: Optional[Sequence[Gender]] = None,
                  word_window: int = 5) -> None:
@@ -412,8 +412,8 @@ def __init__(self,
         if tags is None:
             tags = NLTK_TAGS_ADJECTIVES
 
-        if isinstance(texts, Corpus):
-            documents = texts.documents
+        #if isinstance(texts, Corpus):
+            #documents = texts.documents
         elif isinstance(texts, Document):
             documents = [texts]
         elif isinstance(texts, list):