Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 152 additions & 0 deletions GSoC24_H/models/coref_model/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# =============================================================================
# Before you start changing anything here, read the comments.
# All of them can be found below in the "DEFAULT" section

[DEFAULT]

# The directory that contains extracted files of everything you've downloaded.
data_dir = "coref/model"

# Train, dev and test jsonlines
# train_data = "data/english_train_head.jsonlines"
# dev_data = "data/english_development_head.jsonlines"
# test_data = "data/english_test_head.jsonlines"

train_data = "data/hindi1/hindi_train_head.jsonlines"
dev_data = "data/hindi1/hindi_development_head.jsonlines"
test_data = "data/hindi1/hindi_test_head.jsonlines"

# train_data = "data/all_train_head.jsonlines"
# dev_data = "data/all_development_head.jsonlines"
# test_data = "data/all_test_head.jsonlines"

# The device where everything is to be placed. "cuda:N"/"cpu" are supported.
device = "cpu"


# Bert settings ======================

# Base bert model architecture and tokenizer
bert_model = "bert-large-cased"

# Controls max length of sequences passed through bert to obtain its
# contextual embeddings
# Must be less than or equal to 512
bert_window_size = 512


# General model settings =============

# Controls the dimensionality of feature embeddings
embedding_size = 20

# Controls the dimensionality of distance embeddings used by SpanPredictor
sp_embedding_size = 64

# Controls the number of spans for which anaphoricity can be scored in one
# batch. Only affects final scoring; mention extraction and rough scoring
# are less memory intensive, so they are always done in just one batch.
a_scoring_batch_size = 512

# AnaphoricityScorer FFNN parameters
hidden_size = 1024
n_hidden_layers = 1


# Mention extraction settings ========

# Mention extractor will check spans up to max_span_len words
# The default value is chosen to be big enough to hold any dev data span
max_span_len = 64


# Pruning settings ===================

# Controls how many pairs should be preserved per mention
# after applying rough scoring.
rough_k = 50


# Training settings ==================

# Controls whether to fine-tune bert_model
bert_finetune = true

# Controls the dropout rate throughout all models
dropout_rate = 0.3

# Bert learning rate (only used if bert_finetune is set)
bert_learning_rate = 1e-5

# Task learning rate
learning_rate = 3e-4
# learning_rate = 1e-5

# For how many epochs the training is done
train_epochs = 10

# Controls the weight of binary cross entropy loss added to nlml loss
bce_loss_weight = 0.5

# The directory that will contain conll prediction files
conll_log_dir = "data/conll_logs"

# =============================================================================
# Extra keyword arguments to be passed to bert tokenizers of specified models
[DEFAULT.tokenizer_kwargs]
[DEFAULT.tokenizer_kwargs.roberta-large]
"add_prefix_space" = true

[DEFAULT.tokenizer_kwargs.spanbert-large-cased]
"do_lower_case" = false

[DEFAULT.tokenizer_kwargs.bert-large-cased]
"do_lower_case" = false

[DEFAULT.tokenizer_kwargs.bert-base-multilingual-cased]
"do_lower_case" = false

# =============================================================================
# The sections listed here do not need to make use of all config variables
# If a variable is omitted, its default value will be used instead

# -------------- new ------------
[mbert_cased]
bert_model = "bert-base-multilingual-cased"

[xlmr]
bert_model = "xlm-roberta-base"

[mbert_uncased]
bert_model = "bert-base-multilingual-uncased"
# ------------------------------

[roberta]
bert_model = "roberta-large"

[roberta_no_bce]
bert_model = "roberta-large"
bce_loss_weight = 0.0

[spanbert]
bert_model = "SpanBERT/spanbert-large-cased"

[spanbert_no_bce]
bert_model = "SpanBERT/spanbert-large-cased"
bce_loss_weight = 0.0

[bert]
bert_model = "bert-large-cased"

[longformer]
bert_model = "allenai/longformer-large-4096"
bert_window_size = 2048

[debug]
bert_window_size = 384
bert_finetune = false
device = "cpu:0"

[debug_gpu]
bert_window_size = 384
bert_finetune = false
3 changes: 2 additions & 1 deletion GSoC24_H/models/download_models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ wget "https://dl.fbaipublicfiles.com/GENRE/fairseq_multilingual_entity_disambigu
tar -xzvf fairseq_multilingual_entity_disambiguation.tar.gz -C ./EL_model
wget -P ./EL_model "http://dl.fbaipublicfiles.com/GENRE/titles_lang_all105_marisa_trie_with_redirect.pkl"

# rm fairseq_multilingual_entity_disambiguation.tar.gz # remove the tar.gz file after extraction
# remove the tar.gz files after extraction (run if successful)
# rm wl_coref_transmucores.tar.gz files_indie.tar.gz fairseq_multilingual_entity_disambiguation.tar.gz
2 changes: 2 additions & 0 deletions GSoC24_H/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ requests
nltk
graphviz
fairseq
genre
marisa-trie
sklearn-crfsuite
gdown # for downloading the models from google drive
sentencepiece
2 changes: 1 addition & 1 deletion GSoC24_H/src/chunking/crf_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def predict_with_crf(sent):
)

file = open(
"chunking/state_dicts/model/sklearn_crf_model_v2_pos_mapped_2.pkl", "rb"
"../../models/RE_model/files_indie/sklearn_crf_model_v2_pos_mapped_2.pkl", "rb"
)
crf = pickle.load(file)
file.close()
Expand Down
13 changes: 10 additions & 3 deletions GSoC24_H/src/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def load_coref_model():
coref_model = CorefModel("models/coref_model/config.toml", "xlmr")
coref_model.config.device = device
coref_model.load_weights(
path="models/coref_model/xlmr_multi_plus_hi2.pt",
path="models/coref_model/model/xlmr_multi_plus_hi2.pt",
map_location=device,
ignore={
"bert_optimizer",
Expand All @@ -39,10 +39,17 @@ def load_nlp_model():

@st.cache_resource
def load_el_model():
with open("input/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
# might need to manually make the following change in the fairseq model loading
# depending on the pytorch version being used
# File: `fairseq/fairseq/checkpoint_utils.py`
# Line: 271
# Reason: The model checkpoint is not "weights-only".
# Change to:
# state = torch.load(f, map_location=torch.device("cpu"), weights_only=False)
with open("models/EL_model/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
trie = pickle.load(f)
el_model = mGENRE.from_pretrained(
"models/fairseq_multilingual_entity_disambiguation"
"models/EL_model/fairseq_multilingual_entity_disambiguation"
).eval()
return trie, el_model

Expand Down
13 changes: 10 additions & 3 deletions GSoC24_H/src/entity_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,21 @@

# with open("input/lang_title2wikidataID-normalized_with_redirect.pkl", "rb") as f:
# lang_title2wikidataID = pickle.load(f)
#

with open("input/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:

# might need to manually make the following change in the fairseq model loading
# depending on the pytorch version being used
# File: `fairseq/fairseq/checkpoint_utils.py`
# Line: 271
# Reason: The model checkpoint is not "weights-only".
# Change to:
# state = torch.load(f, map_location=torch.device("cpu"), weights_only=False)
with open("models/EL_model/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
trie = pickle.load(f)

# generate Wikipedia titles and language IDs
model = mGENRE.from_pretrained(
"models/fairseq_multilingual_entity_disambiguation"
"models/EL_model/fairseq_multilingual_entity_disambiguation"
).eval()

sentences = [
Expand Down
4 changes: 2 additions & 2 deletions GSoC24_H/src/indIE.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
}

hyper_params["embedding_size"] = model_embeddings[hyper_params["bert_model_name"]]
my_tagset = torch.load("input/my_tagset_" + hyper_params["notation"] + ".bin")
my_tagset = torch.load("models/RE_model/files_indie/my_tagset_" + hyper_params["notation"] + ".bin")
hyper_params["my_tagset"] = my_tagset

os.environ["PYTHONHASHSEED"] = str(hyper_params["rseed"])
Expand Down Expand Up @@ -80,7 +80,7 @@
print("Creating the XLM chunker model...")
model = chunker_class(device, hyper_params).to(device)
checkpoint = torch.load(
"models/state_dicts/model/" + str(hyper_params["run_ID"]) + "_epoch_4.pth.tar",
"models/RE_model/files_indie/" + str(hyper_params["run_ID"]) + "_epoch_4.pth.tar",
map_location=device,
)
checkpoint["state_dict"].pop("model.embeddings.position_ids")
Expand Down
14 changes: 11 additions & 3 deletions GSoC24_H/src/start.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def replace_mention(match):
coref_model = CorefModel("models/coref_model/config.toml", "xlmr")
coref_model.config.device = device
coref_model.load_weights(
path="models/coref_model/xlmr_multi_plus_hi2.pt",
path="models/coref_model/model/xlmr_multi_plus_hi2.pt",
map_location=device,
ignore={
"bert_optimizer",
Expand All @@ -56,10 +56,18 @@ def replace_mention(match):
},
)
coref_model.training = False
with open("input/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:

# might need to manually make the following change in the fairseq model loading
# depending on the pytorch version being used
# File: `fairseq/fairseq/checkpoint_utils.py`
# Line: 271
# Reason: The model checkpoint is not "weights-only".
# Change to:
# state = torch.load(f, map_location=torch.device("cpu"), weights_only=False)
with open("models/EL_model/titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
trie = pickle.load(f)
el_model = mGENRE.from_pretrained(
"models/fairseq_multilingual_entity_disambiguation"
"models/EL_model/fairseq_multilingual_entity_disambiguation"
).eval()
nlp = stanza.Pipeline(lang="hi", processors="tokenize,pos")
# tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
Expand Down
14 changes: 14 additions & 0 deletions GSoC25_H/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
*.pkl
*.pt
*.model
__pycache__/
.ipynb_checkpoints/
.DS_Store
.env
models/EL_model/fairseq_multilingual_entity_disambiguation/
models/RE_model/files_indie/
models/state_dicts/model.pt
models/ontology/
*.tar.gz
llm_IE/finetuning/synthetic_data/
llm_IE/full_dataset_results/
5 changes: 5 additions & 0 deletions GSoC25_H/IndIE/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
__pycache__/
venv/
chunking/data/
chunking/state_dicts/model/
.DS_Store
93 changes: 93 additions & 0 deletions GSoC25_H/IndIE/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# IndIE

This is the code for the paper titled "IndIE: A Multilingual Open Information Extraction Tool For Indic Languages" accepted in the Findings of IJCNLP-AACL 2023.

You can check out the live deployment of the IndIE [here](http://103.25.231.59:80).

## Installation

* Git clone.
* Make a new virtual environment.
* Upgrade pip by ```pip install -U pip```
* Install the necessary libraries by using the following command:
```pip install -r requirements.txt```
* If you face any difficulty installing a library, we recommend installing it without the version number. Pip will then install the latest version.
* However, we do not recommend this for the stanza library, because a different version of stanza will yield different dependency parse trees [[source]](https://github.com/stanfordnlp/stanza/issues/990).

## Download Models

Download the relevant files from [here](https://drive.google.com/file/d/1UqOUdeK96m6EabI-cg2EeBz6p3IwrPZ6/view?usp=sharing).

Then place the files in the directories such that your directory structure looks like this:

```
.
├── chunking
│   ├── chunking_model.py
│   ├── crf_chunker.py
│   ├── data
│   │   └── my_tagset_BI.bin
│   └── state_dicts
│   └── model
│   ├── 26_epoch_4.pth.tar
│   └── sklearn_crf_model_v2_pos_mapped_2.pkl
```

## Memory needed

The amount of memory needed varies depending upon the number of languages on which you wish to run your triple extraction.

1) All languages (Hindi/Tamil/Telugu/Urdu)
* GPU present
* ~6GB on CPU/RAM
* ~6GB on GPU
* GPU absent
* ~7GB on CPU/RAM
2) Only one language
* GPU present
* ~3GB on CPU/RAM
* ~4GB on GPU
* GPU absent
* ~3GB on CPU/RAM


## Extracting triples

Specify the language and list of strings in the ```main.py``` file.

On GPU, make sure you have same device order on nvidia-smi and PCI bus. Command: ```export CUDA_DEVICE_ORDER=PCI_BUS_ID```

Run

```CUDA_VISIBLE_DEVICES=0 python main.py```

where ```0``` is your GPU ID. The code also runs in absence of GPU but takes a little longer. In order to run the code only on CPU, simply omit the GPU ID.

## Citation

```Will be updated```


## Workflow for GSoC25_H
1. Update hyperparameters in main.py for running the kind of extractions:

```json
'use_llm': False, # True for running entire MDT output through LLM
'llm_fallback': True, # set to True for running only on sentences which did not get any output from original rule based extractions
'llm_enhancement': True, # set to True for running rule based extraction + llm extraction
'llm_filter_mode': False, # set to True for running llm based filter on the output of all the above strategies
'llm_model': 'gemma3:12b-it-qat'
```


2. Generate extractions using ```python main.py```

3. Convert the generated *.h5 file to tab-separated extractions using ```python convert.py```

4. Copy the file to the IndIE/hindi-benchie/extractions folder, update the code.py file with the correct file path, and run ```python code.py```.






Loading