Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions GSoC24_H/models/coref_model/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# =============================================================================
# Before you start changing anything here, read the comments.
# All of them can be found below in the "DEFAULT" section

[DEFAULT]

# The directory that contains extracted files of everything you've downloaded.
data_dir = "coref/model"

# Train, dev and test jsonlines
# train_data = "data/english_train_head.jsonlines"
# dev_data = "data/english_development_head.jsonlines"
# test_data = "data/english_test_head.jsonlines"

train_data = "data/hindi1/hindi_train_head.jsonlines"
dev_data = "data/hindi1/hindi_development_head.jsonlines"
test_data = "data/hindi1/hindi_test_head.jsonlines"

# train_data = "data/all_train_head.jsonlines"
# dev_data = "data/all_development_head.jsonlines"
# test_data = "data/all_test_head.jsonlines"

# The device where everything is to be placed. "cuda:N"/"cpu" are supported.
device = "cpu"


# Bert settings ======================

# Base bert model architecture and tokenizer
bert_model = "bert-large-cased"

# Controls max length of sequences passed through bert to obtain its
# contextual embeddings
# Must be less than or equal to 512
bert_window_size = 512


# General model settings =============

# Controls the dimensionality of feature embeddings
embedding_size = 20

# Controls the dimensionality of distance embeddings used by SpanPredictor
sp_embedding_size = 64

# Controls the number of spans for which anaphoricity can be scored in one
# batch. Only affects final scoring; mention extraction and rough scoring
# are less memory intensive, so they are always done in just one batch.
a_scoring_batch_size = 512

# AnaphoricityScorer FFNN parameters
hidden_size = 1024
n_hidden_layers = 1


# Mention extraction settings ========

# Mention extractor will check spans up to max_span_len words
# The default value is chosen to be big enough to hold any dev data span
max_span_len = 64


# Pruning settings ===================

# Controls how many pairs should be preserved per mention
# after applying rough scoring.
rough_k = 50


# Training settings ==================

# Controls whether to fine-tune bert_model
bert_finetune = true

# Controls the dropout rate throughout all models
dropout_rate = 0.3

# Bert learning rate (only used if bert_finetune is set)
bert_learning_rate = 1e-5

# Task learning rate
learning_rate = 3e-4
# learning_rate = 1e-5

# For how many epochs the training is done
train_epochs = 10

# Controls the weight of binary cross entropy loss added to nlml loss
bce_loss_weight = 0.5

# The directory that will contain conll prediction files
conll_log_dir = "data/conll_logs"

# =============================================================================
# Extra keyword arguments to be passed to bert tokenizers of specified models
[DEFAULT.tokenizer_kwargs]
[DEFAULT.tokenizer_kwargs.roberta-large]
"add_prefix_space" = true

[DEFAULT.tokenizer_kwargs.spanbert-large-cased]
"do_lower_case" = false

[DEFAULT.tokenizer_kwargs.bert-large-cased]
"do_lower_case" = false

[DEFAULT.tokenizer_kwargs.bert-base-multilingual-cased]
"do_lower_case" = false

# =============================================================================
# The sections listed here do not need to make use of all config variables
# If a variable is omitted, its default value will be used instead

# -------------- new ------------
[mbert_cased]
bert_model = "bert-base-multilingual-cased"

[xlmr]
bert_model = "xlm-roberta-base"

[mbert_uncased]
bert_model = "bert-base-multilingual-uncased"
# ------------------------------

[roberta]
bert_model = "roberta-large"

[roberta_no_bce]
bert_model = "roberta-large"
bce_loss_weight = 0.0

[spanbert]
bert_model = "SpanBERT/spanbert-large-cased"

[spanbert_no_bce]
bert_model = "SpanBERT/spanbert-large-cased"
bce_loss_weight = 0.0

[bert]
bert_model = "bert-large-cased"

[longformer]
bert_model = "allenai/longformer-large-4096"
bert_window_size = 2048

[debug]
bert_window_size = 384
bert_finetune = false
device = "cpu:0"

[debug_gpu]
bert_window_size = 384
bert_finetune = false
3 changes: 2 additions & 1 deletion GSoC24_H/models/download_models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ wget "https://dl.fbaipublicfiles.com/GENRE/fairseq_multilingual_entity_disambigu
tar -xzvf fairseq_multilingual_entity_disambiguation.tar.gz -C ./EL_model
wget -P ./EL_model "http://dl.fbaipublicfiles.com/GENRE/titles_lang_all105_marisa_trie_with_redirect.pkl"

# rm fairseq_multilingual_entity_disambiguation.tar.gz # remove the tar.gz file after extraction
# remove the tar.gz files after extraction (run if successful)
# rm wl_coref_transmucores.tar.gz files_indie.tar.gz fairseq_multilingual_entity_disambiguation.tar.gz
2 changes: 2 additions & 0 deletions GSoC24_H/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ requests
nltk
graphviz
fairseq
genre
marisa-trie
sklearn-crfsuite
gdown # for downloading the models from google drive
sentencepiece
2 changes: 1 addition & 1 deletion GSoC24_H/src/chunking/crf_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def predict_with_crf(sent):
)

file = open(
"chunking/state_dicts/model/sklearn_crf_model_v2_pos_mapped_2.pkl", "rb"
"../../models/RE_model/files_indie/sklearn_crf_model_v2_pos_mapped_2.pkl", "rb"
)
crf = pickle.load(file)
file.close()
Expand Down
13 changes: 10 additions & 3 deletions GSoC24_H/src/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def load_coref_model():
coref_model = CorefModel("models/coref_model/config.toml", "xlmr")
coref_model.config.device = device
coref_model.load_weights(
path="models/coref_model/xlmr_multi_plus_hi2.pt",
path="models/coref_model/model/xlmr_multi_plus_hi2.pt",
map_location=device,
ignore={
"bert_optimizer",
Expand All @@ -39,10 +39,17 @@ def load_nlp_model():

@st.cache_resource
def load_el_model():
with open("input/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
# might need to manually make the following change in the fairseq model loading
# depending on the pytorch version being used
# File: `fairseq/fairseq/checkpoint_utils.py`
# Line: 271
# Reason: The model checkpoint is not "weights-only".
# Change to:
# state = torch.load(f, map_location=torch.device("cpu"), weights_only=False)
with open("models/EL_model/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
trie = pickle.load(f)
el_model = mGENRE.from_pretrained(
"models/fairseq_multilingual_entity_disambiguation"
"models/EL_model/fairseq_multilingual_entity_disambiguation"
).eval()
return trie, el_model

Expand Down
13 changes: 10 additions & 3 deletions GSoC24_H/src/entity_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,21 @@

# with open("input/lang_title2wikidataID-normalized_with_redirect.pkl", "rb") as f:
# lang_title2wikidataID = pickle.load(f)
#

with open("input/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:

# might need to manually make the following change in the fairseq model loading
# depending on the pytorch version being used
# File: `fairseq/fairseq/checkpoint_utils.py`
# Line: 271
# Reason: The model checkpoint is not "weights-only".
# Change to:
# state = torch.load(f, map_location=torch.device("cpu"), weights_only=False)
with open("models/EL_model/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
trie = pickle.load(f)

# generate Wikipedia titles and language IDs
model = mGENRE.from_pretrained(
"models/fairseq_multilingual_entity_disambiguation"
"models/EL_model/fairseq_multilingual_entity_disambiguation"
).eval()

sentences = [
Expand Down
4 changes: 2 additions & 2 deletions GSoC24_H/src/indIE.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
}

hyper_params["embedding_size"] = model_embeddings[hyper_params["bert_model_name"]]
my_tagset = torch.load("input/my_tagset_" + hyper_params["notation"] + ".bin")
my_tagset = torch.load("models/RE_model/files_indie/my_tagset_" + hyper_params["notation"] + ".bin")
hyper_params["my_tagset"] = my_tagset

os.environ["PYTHONHASHSEED"] = str(hyper_params["rseed"])
Expand Down Expand Up @@ -80,7 +80,7 @@
print("Creating the XLM chunker model...")
model = chunker_class(device, hyper_params).to(device)
checkpoint = torch.load(
"models/state_dicts/model/" + str(hyper_params["run_ID"]) + "_epoch_4.pth.tar",
"models/RE_model/files_indie/" + str(hyper_params["run_ID"]) + "_epoch_4.pth.tar",
map_location=device,
)
checkpoint["state_dict"].pop("model.embeddings.position_ids")
Expand Down
14 changes: 11 additions & 3 deletions GSoC24_H/src/start.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def replace_mention(match):
coref_model = CorefModel("models/coref_model/config.toml", "xlmr")
coref_model.config.device = device
coref_model.load_weights(
path="models/coref_model/xlmr_multi_plus_hi2.pt",
path="models/coref_model/model/xlmr_multi_plus_hi2.pt",
map_location=device,
ignore={
"bert_optimizer",
Expand All @@ -56,10 +56,18 @@ def replace_mention(match):
},
)
coref_model.training = False
with open("input/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:

# might need to manually make the following change in the fairseq model loading
# depending on the pytorch version being used
# File: `fairseq/fairseq/checkpoint_utils.py`
# Line: 271
# Reason: The model checkpoint is not "weights-only".
# Change to:
# state = torch.load(f, map_location=torch.device("cpu"), weights_only=False)
with open("models/EL_model/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
trie = pickle.load(f)
el_model = mGENRE.from_pretrained(
"models/fairseq_multilingual_entity_disambiguation"
"models/EL_model/fairseq_multilingual_entity_disambiguation"
).eval()
nlp = stanza.Pipeline(lang="hi", processors="tokenize,pos")
# tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
Expand Down
14 changes: 14 additions & 0 deletions GSoC25_H/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
*.pkl
*.pt
*.model
__pycache__/
.ipynb_checkpoints/
.DS_Store
.env
models/EL_model/fairseq_multilingual_entity_disambiguation/
models/RE_model/files_indie/
models/state_dicts/model.pt
models/ontology/
*.tar.gz
llm_IE/finetuning/synthetic_data/
llm_IE/full_dataset_results/
5 changes: 5 additions & 0 deletions GSoC25_H/IndIE/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
__pycache__/
venv/
chunking/data/
chunking/state_dicts/model/
.DS_Store
93 changes: 93 additions & 0 deletions GSoC25_H/IndIE/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# IndIE

This is the code for the paper titled "IndIE: A Multilingual Open Information Extraction Tool For Indic Languages" accepted in the Findings of IJCNLP-AACL 2023.

You can check out the live deployment of the IndIE [here](http://103.25.231.59:80).

## Installation

* Git clone.
* Make a new virtual environment.
* Upgrade pip by ```pip install -U pip```
* Install the necessary libraries by using the following command:
```pip install -r requirements.txt```
* If you face any difficulty installing a library, we recommend installing it without the version number. Pip will then install the latest version.
* However, we do not recommend this for the stanza library, because a different version of stanza will yield different dependency parse trees [[source]](https://github.com/stanfordnlp/stanza/issues/990).

## Download Models

Download the relevant files from [here](https://drive.google.com/file/d/1UqOUdeK96m6EabI-cg2EeBz6p3IwrPZ6/view?usp=sharing).

Then place the files in the directories such that your directory structure looks like this:

```
.
├── chunking
│   ├── chunking_model.py
│   ├── crf_chunker.py
│   ├── data
│   │   └── my_tagset_BI.bin
│   └── state_dicts
│   └── model
│   ├── 26_epoch_4.pth.tar
│   └── sklearn_crf_model_v2_pos_mapped_2.pkl
```

## Memory needed

The amount of memory needed varies depending upon the number of languages on which you wish to run your triple extraction.

1) All languages (Hindi/Tamil/Telugu/Urdu)
* GPU present
* ~6GB on CPU/RAM
* ~6GB on GPU
* GPU absent
* ~7GB on CPU/RAM
2) Only one language
* GPU present
* ~3GB on CPU/RAM
* ~4GB on GPU
* GPU absent
* ~3GB on CPU/RAM


## Extracting triples

Specify the language and list of strings in the ```main.py``` file.

On GPU, make sure you have same device order on nvidia-smi and PCI bus. Command: ```export CUDA_DEVICE_ORDER=PCI_BUS_ID```

Run

```CUDA_VISIBLE_DEVICES=0 python main.py```

where ```0``` is your GPU ID. The code also runs in absence of GPU but takes a little longer. In order to run the code only on CPU, simply omit the GPU ID.

## Citation

```Will be updated```


## Workflow for GSoC25_H
1. Update hyperparameters in main.py for running the kind of extractions:

```json
'use_llm': False, # True for running entire MDT output through LLM
'llm_fallback': True, # set to True for running only on sentences which did not get any output from original rule based extractions
'llm_enhancement': True, # set to True for running rule based extraction + llm extraction
'llm_filter_mode': False, # set to True for running llm based filter on the output of all the above strategies
'llm_model': 'gemma3:12b-it-qat'
```


2. Generate extractions using ```python main.py```

3. Convert the generated *.h5 file to tab-separated extractions using ```python convert.py```

4. Copy the file to the IndIE/hindi-benchie/extractions folder, update the code.py file with the correct file path, and run ```python code.py```.






Loading