-
Notifications
You must be signed in to change notification settings - Fork 5
char_list and char_disamb functions in document.py #164
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,9 @@ | |
|
|
||
| from corpus_analysis import common | ||
|
|
||
| # for character identification pipeline | ||
| from corpus_analysis.character import HONORIFICS | ||
|
|
||
|
|
||
| class Document: | ||
| """ | ||
|
|
@@ -637,3 +640,83 @@ def update_metadata(self, new_metadata): | |
| f" not '{new_metadata['date']}'" | ||
| ) from err | ||
| setattr(self, key, new_metadata[key]) | ||
|
|
||
def get_char_list(self, cutoff_num=10):
    """
    Find the character names mentioned in this document and how often each occurs.

    The document text is split into sentences; each sentence is word-tokenized,
    POS-tagged and passed through NLTK's named-entity chunker. Every chunk labelled
    ``PERSON`` counts as one mention of the name obtained by joining the chunk's
    tokens with single spaces. Names that are themselves entries of ``HONORIFICS``
    are discarded.

    :param cutoff_num: minimum number of mentions a name needs in order to be kept
        (defaults to 10); names mentioned exactly ``cutoff_num`` times are kept
    :return: a list of ``(name, count)`` tuples in descending order of count; names
        with equal counts stay in order of first appearance in the text

    >>> from corpus_analysis import document
    >>> from pathlib import Path
    >>> from gender_analysis import common
    >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818',
    ...                      'filename': 'austen_persuasion.txt',
    ...                      'filepath': Path(common.TEST_DATA_PATH, 'sample_novels',
    ...                                       'texts', 'austen_persuasion.txt')}
    >>> persuasion = document.Document(document_metadata)
    >>> persuasion_chars = persuasion.get_char_list(20)
    >>> persuasion_chars
    [('Anne', 425), ('Captain Wentworth', 119), ('Lady Russell', 116), ('Charles', 115), ('Mary', 113), ('Sir Walter', 95), ('Elizabeth', 82), ('Elliot', 76), ('Louisa', 66), ('Henrietta', 65), ('Mrs Musgrove', 40), ('Mrs Smith', 36), ('Mrs Clay', 33), ('Miss Elliot', 32), ('Captain Benwick', 32), ('Wentworth', 31), ('Charles Hayter', 29), ('Mrs Croft', 27), ('Benwick', 26), ('Musgrove', 24), ('Uppercross', 23), ('Lady Dalrymple', 22), ('Captain Harville', 21)]
    >>> len(persuasion_chars)
    23
    """
    # Local import keeps this method self-contained; Counter is standard library.
    from collections import Counter

    # presumably the full text of the document as one string -- confirm against
    # _load_document_text
    text = self._load_document_text()

    mentions = Counter()
    for sentence in nltk.sent_tokenize(text):
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
        for chunk in nltk.ne_chunk(tagged_words):
            # Only named-entity subtrees expose label(); ordinary tokens are skipped.
            # Non-PERSON entities are ignored outright, so an unexpected label can
            # no longer raise KeyError.
            if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
                mentions[' '.join(leaf[0] for leaf in chunk)] += 1

    # most_common() is a stable sort by descending count, so ties keep first-seen
    # order. Because the list is sorted, filtering on the threshold is the same as
    # truncating at the first count below it.
    return [
        (name, count)
        for name, count in mentions.most_common()
        if name not in HONORIFICS and count >= cutoff_num
    ]
|
|
||
@staticmethod
def filter_honr(name):
    """
    Split a name on single spaces and drop every word listed in ``HONORIFICS``.

    :param name: a character name as a string, e.g. a name from ``get_char_list``
    :return: list of the remaining words, in their original order
    """
    kept_words = []
    for word in name.split(' '):
        if word in HONORIFICS:
            continue
        kept_words.append(word)
    return kept_words
|
|
||
def char_name_disambiguation(self, char_list):
    """
    Group character names that may refer to the same person.

    Two names are grouped when, after removing honorifics with ``filter_honr``,
    they share at least one word. Each entry of ``char_list`` except the last
    starts a cluster, and every *later* entry sharing a word with it is appended
    to that cluster. A name can therefore appear in several clusters.

    :param char_list: list of ``(name, count)`` tuples as returned by
        ``get_char_list``
    :return: a list of clusters; each cluster is a list of ``(name, count)`` tuples
        whose first element is the cluster's head name, followed by the later
        names that overlap with it

    >>> from corpus_analysis import document
    >>> from pathlib import Path
    >>> from gender_analysis import common
    >>> document_metadata = {'author': 'Austen, Jane', 'title': 'Persuasion', 'date': '1818',
    ...                      'filename': 'austen_persuasion.txt',
    ...                      'filepath': Path(common.TEST_DATA_PATH, 'sample_novels',
    ...                                       'texts', 'austen_persuasion.txt')}
    >>> persuasion = document.Document(document_metadata)
    >>> persuasion_chars = persuasion.get_char_list(20)
    >>> disamb = persuasion.char_name_disambiguation(persuasion_chars)
    >>> disamb
    [[('Anne', 425)], [('Captain Wentworth', 119), ('Captain Benwick', 32), ('Wentworth', 31), ('Captain Harville', 21)], [('Lady Russell', 116)], [('Charles', 115), ('Charles Hayter', 29)], [('Mary', 113)], [('Sir Walter', 95)], [('Elizabeth', 82)], [('Elliot', 76), ('Miss Elliot', 32)], [('Louisa', 66)], [('Henrietta', 65)], [('Mrs Musgrove', 40), ('Musgrove', 24)], [('Mrs Smith', 36)], [('Mrs Clay', 33)], [('Miss Elliot', 32)], [('Captain Benwick', 32), ('Benwick', 26), ('Captain Harville', 21)], [('Wentworth', 31)], [('Charles Hayter', 29)], [('Mrs Croft', 27)], [('Benwick', 26)], [('Musgrove', 24)], [('Uppercross', 23)], [('Lady Dalrymple', 22)]]
    >>> len(disamb)
    22
    """
    # Compute each name's honorific-free word set once, instead of re-splitting
    # both names for every pair inside the quadratic loop below.
    word_sets = [set(self.filter_honr(entry[0])) for entry in char_list]

    clusters = []
    # NOTE(review): the last entry never starts a cluster of its own (23 names give
    # 22 clusters in the doctest above). That behaviour is preserved here; confirm
    # whether it is intended before changing it.
    for index, head in enumerate(char_list[:-1]):
        head_words = word_sets[index]
        cluster = [head]
        cluster.extend(
            later
            for later, later_words in zip(char_list[index + 1:], word_sets[index + 1:])
            if head_words & later_words
        )
        clusters.append(cluster)
    return clusters
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As we talked about in Slack briefly, this method probably requires some thinking through. If I'm reading the test output in 710 correctly, it looks like the disambiguation is overly generous, and I suspect we can figure out a more optimized way to traverse those character lists. Let's chat through some issues in office hours. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,11 +3,9 @@ | |
| 'dunning', | ||
| 'gender_frequency', | ||
| 'instance_distance', | ||
| 'proximity', | ||
| ] | ||
|
|
||
| from gender_analysis.analysis.dependency_parsing import * | ||
| from gender_analysis.analysis.dunning import * | ||
| from gender_analysis.analysis.gender_frequency import * | ||
| from gender_analysis.analysis.instance_distance import * | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A note for posterity. This is a temporary measure to prevent circular imports caused by |
||
| from gender_analysis.analysis.proximity import * | ||
Uh oh!
There was an error while loading. Please reload this page.