Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions corpus_analysis/character.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
from gender_analysis.pronouns import PronounSeries
from gender_analysis.gender import Gender

# Honorific titles used by the character-identification pipeline to filter
# NER "person" hits that are really bare titles, and (via filter_honr) to
# strip titles when comparing candidate character names.
# NOTE: entries are stored without trailing periods ("Mr", not "Mr.").
FEMALE_HONORIFICS = ["Miss", "Mrs", "Ms", "Mistress", "Madam", "Ma'am", "Dame",
                     "Lady", "Her Honour", "Her Honor", "My Lady", "Your Ladyship",
                     "Sr", "Sister", "Sayyidah"]
# "Master" was previously listed twice; each title appears exactly once.
MALE_HONORIFICS = ["Master", "Mr", "Sir", "Gentleman", "Sire", "Lord", "His Honour",
                   "His Honor", "My Lord", "Your Lordship", "Esquire", "Esq",
                   "His Holiness", "Pope", "His All Holiness", "His Beatitude", "The Reverend",
                   "Rev", "Fr", "Father", "Pr", "Pastor", "Br", "Brother", "Rabbi", "Imam",
                   "Mufti", "Sayyid"]
NEUTRAL_HONORIFICS = ["Mx", "Excellency", "Excellence", "Your Honor", "The Honorable",
                      "The Honourable", "The Hon", "Hon", "The Hon'ble", "The Right Honourable",
                      "The Most Honourable", "Dr", "Doctor", "Professor", "QC", "Cl", "S Cl",
                      "Counsel", "Senior Counsel", "Eur Ing", "Vice-Chancellor", "Principal",
                      "President", "Warden", "Dean", "Regent", "Rector", "Provost", "Director",
                      "Chief Executive", "Venerable", "Eminent"]
# Flat lookup list combining all three categories.
HONORIFICS = FEMALE_HONORIFICS + MALE_HONORIFICS + NEUTRAL_HONORIFICS

class Character:
"""
Expand Down
83 changes: 83 additions & 0 deletions corpus_analysis/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@

from corpus_analysis import common

# for character identification pipeline
from corpus_analysis.character import HONORIFICS


class Document:
"""
Expand Down Expand Up @@ -637,3 +640,83 @@ def update_metadata(self, new_metadata):
f" not '{new_metadata['date']}'"
) from err
setattr(self, key, new_metadata[key])

def get_char_list(self, cutoff_num=10):
    """
    Find the characters named in this document, with their frequencies.

    Runs NLTK named-entity chunking over the document text, counts every
    PERSON entity, drops names that are bare honorifics (e.g. "Mrs"), and
    returns only the names that occur at least ``cutoff_num`` times.

    :param cutoff_num: threshold frequency for including a name (default 10)
    :return: list of ``(name, count)`` tuples in descending count order;
        ties keep first-appearance order (stable sort)

    >>> from corpus_analysis import document
    >>> from pathlib import Path
    >>> from gender_analysis import common
    >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818',
    ...                      'filename': 'austen_persuasion.txt',
    ...                      'filepath': Path(common.TEST_DATA_PATH, 'sample_novels',
    ...                                       'texts', 'austen_persuasion.txt')}
    >>> persuasion = document.Document(document_metadata)
    >>> persuasion_chars = persuasion.get_char_list(20)
    >>> persuasion_chars
    [('Anne', 425), ('Captain Wentworth', 119), ('Lady Russell', 116), ('Charles', 115), ('Mary', 113), ('Sir Walter', 95), ('Elizabeth', 82), ('Elliot', 76), ('Louisa', 66), ('Henrietta', 65), ('Mrs Musgrove', 40), ('Mrs Smith', 36), ('Mrs Clay', 33), ('Miss Elliot', 32), ('Captain Benwick', 32), ('Wentworth', 31), ('Charles Hayter', 29), ('Mrs Croft', 27), ('Benwick', 26), ('Musgrove', 24), ('Uppercross', 23), ('Lady Dalrymple', 22), ('Captain Harville', 21)]
    >>> len(persuasion_chars)
    23
    """
    # Count only PERSON entities; the other NE labels (GPE, ORGANIZATION,
    # ...) were never read downstream, so we don't tally them.
    person_counts = {}
    text = self._load_document_text()
    for sentence in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for chunk in nltk.ne_chunk(tagged):
            # Named-entity subtrees carry a label(); plain (word, tag)
            # tuples do not.
            if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
                name = ' '.join(token for token, _tag in chunk)
                person_counts[name] = person_counts.get(name, 0) + 1

    # Skip names that are nothing but an honorific ("Mrs", "Sir", ...).
    # sorted() is stable, so equal counts keep first-occurrence order,
    # matching the previous implementation.
    ranked = sorted(
        ((name, count) for name, count in person_counts.items()
         if name not in HONORIFICS),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # With the list sorted in descending count order, keeping counts
    # >= cutoff_num is equivalent to truncating at the first one below it.
    return [(name, count) for name, count in ranked if count >= cutoff_num]

@staticmethod
def filter_honr(name):
    """
    Remove honorific words from a character name.

    Splits on any whitespace (so repeated spaces no longer produce empty
    tokens) and compares each word against HONORIFICS with any trailing
    period stripped, so "Mr." matches the bare "Mr" entry. The returned
    tokens themselves are unchanged.

    :param name: a character name, e.g. "Captain Wentworth"
    :return: list of the name's words that are not honorifics
    """
    return [part for part in name.split()
            if part.rstrip('.') not in HONORIFICS]

def char_name_disambiguation(self, char_list):
    """
    Group character names that may refer to the same person.

    For each character, builds a cluster containing that character plus
    every later character whose honorific-stripped name shares at least
    one word with it (e.g. "Captain Wentworth" clusters with "Wentworth").
    Every input character starts exactly one cluster, so the result has
    one cluster per input entry — the previous implementation dropped the
    final character because it iterated ``range(len(char_list) - 1)``.

    :param char_list: list of ``(name, count)`` tuples from get_char_list
    :return: list of clusters; each cluster is a list of ``(name, count)``
        tuples whose first entry is the cluster's anchor name

    >>> from corpus_analysis import document
    >>> from pathlib import Path
    >>> from gender_analysis import common
    >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818',
    ...                      'filename': 'austen_persuasion.txt',
    ...                      'filepath': Path(common.TEST_DATA_PATH, 'sample_novels',
    ...                                       'texts', 'austen_persuasion.txt')}
    >>> persuasion = document.Document(document_metadata)
    >>> persuasion_chars = persuasion.get_char_list(20)
    >>> len(persuasion.char_name_disambiguation(persuasion_chars))
    23
    """
    clusters = []
    for i, (name, count) in enumerate(char_list):
        cluster = [(name, count)]
        # Hoist the anchor's stripped-name word set out of the inner loop;
        # it is invariant there.
        anchor_words = set(self.filter_honr(name))
        for other in char_list[i + 1:]:
            if anchor_words & set(self.filter_honr(other[0])):
                cluster.append(other)
        clusters.append(cluster)
    return clusters
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As we talked about in Slack briefly, this method probably requires some thinking through. If I'm reading the test output in 710 correctly, it looks like the disambiguation is overly generous, and I suspect we can figure out a more optimized way to traverse those character lists. Let's chat through some issues in office hours.

2 changes: 0 additions & 2 deletions gender_analysis/analysis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
'dunning',
'gender_frequency',
'instance_distance',
'proximity',
]

from gender_analysis.analysis.dependency_parsing import *
from gender_analysis.analysis.dunning import *
from gender_analysis.analysis.gender_frequency import *
from gender_analysis.analysis.instance_distance import *
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A note for posterity. This is a temporary measure to prevent circular imports caused by proximity.py importing Corpus for type hinting. PR #163 attempts to address the issue more fundamentally.

from gender_analysis.analysis.proximity import *
16 changes: 8 additions & 8 deletions gender_analysis/analysis/proximity.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from more_itertools import windowed
import nltk

from corpus_analysis.corpus import Corpus
#from corpus_analysis.corpus import Corpus
from corpus_analysis.document import Document
from corpus_analysis.common import load_pickle, store_pickle, NLTK_TAGS, NLTK_TAGS_ADJECTIVES

Expand Down Expand Up @@ -79,7 +79,7 @@ def _diff_gender_token_counters(gender_token_counters: GenderTokenCounters,
return difference_dict


def _generate_token_counter(document: Document,
def _generate_token_counter(document,
gender_to_find: Gender,
word_window: int,
tags: Sequence[str],
Expand Down Expand Up @@ -139,7 +139,7 @@ def _generate_token_counter(document: Document,
return output


def _generate_gender_token_counters(document: Document,
def _generate_gender_token_counters(document,
genders: Sequence[Gender],
tags: Sequence[str],
word_window: int) -> GenderTokenCounters:
Expand Down Expand Up @@ -278,7 +278,7 @@ def _sort_token_counter(token_counter: Counter,
return output_token_counter.most_common(limit)


def find_in_document_gender(document: Document,
def find_in_document_gender(document,
gender: Gender,
tags: Sequence[str] = None,
word_window: int = 5,
Expand Down Expand Up @@ -313,7 +313,7 @@ def find_in_document_gender(document: Document,
genders_to_exclude=genders_to_exclude)


def find_in_document_female(document: Document,
def find_in_document_female(document,
tags: Sequence[str] = None,
word_window: int = 5) -> Counter:
"""
Expand Down Expand Up @@ -391,7 +391,7 @@ class GenderProximityAnalyzer:
"""

def __init__(self,
texts: Union[Document, Corpus, Sequence[Document]],
texts,
tags: Optional[Sequence[str]] = None,
genders: Optional[Sequence[Gender]] = None,
word_window: int = 5) -> None:
Expand All @@ -412,8 +412,8 @@ def __init__(self,
if tags is None:
tags = NLTK_TAGS_ADJECTIVES

if isinstance(texts, Corpus):
documents = texts.documents
#if isinstance(texts, Corpus):
#documents = texts.documents
Comment thread
fyang3 marked this conversation as resolved.
Outdated
elif isinstance(texts, Document):
documents = [texts]
elif isinstance(texts, list):
Expand Down