diff --git a/scripts/conceptnet2openke/README.md b/scripts/conceptnet2openke/README.md
new file mode 100644
index 00000000..0e642104
--- /dev/null
+++ b/scripts/conceptnet2openke/README.md
@@ -0,0 +1,17 @@
+# conceptnet2openke
+An end-to-end Python 3 script that converts ConceptNet assertions into the training file format consumed by OpenKE.
+
+### Usage:
+
+```shell
+python3 conceptnet2openke.py cache_folder_path language topn_ents output_folder
+```
+
+Parameters:
+* `cache_folder_path`: folder in which the ConceptNet assertions file is downloaded and cached
+* `language`: language of the concepts to extract (e.g. en, ru, uk)
+* `topn_ents`: number of most popular concepts to keep
+* `output_folder`: folder in which the OpenKE-consumable files are stored,
+  i.e., entity2id.txt, relation2id.txt, and train2id.txt
+
diff --git a/scripts/conceptnet2openke/conceptnet2openke.py b/scripts/conceptnet2openke/conceptnet2openke.py
new file mode 100644
index 00000000..99a67b42
--- /dev/null
+++ b/scripts/conceptnet2openke/conceptnet2openke.py
@@ -0,0 +1,190 @@
+"""
+@author: Rehan Ahmed
+This is an end-to-end script that converts ConceptNet assertions into
+training input consumable by OpenKE models for a given language.
+Steps:
+    1) Download the assertions file into the provided cache folder.
+    2) Unzip the file to get assertions.csv.
+    3) Create an intermediate conceptnet-<language>.csv file that
+       contains only the assertions for the requested language.
+    4) Extract the top-n concepts that occur most frequently in those
+       assertions.
+    5) Create the train2id.txt file.
+"""
+
+import csv
+import gzip
+import os.path
+import shutil
+import sys
+from collections import defaultdict
+from random import shuffle
+from statistics import mean
+
+import requests
+from tqdm import tqdm
+
+
+def download(url, save_path):
+    # Stream the response so we can iterate over it chunk by chunk
+    # instead of holding the whole file in memory.
+    r = requests.get(url, stream=True)
+    # Total size in bytes.
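+    # Note: the Content-Length header may be missing (e.g. with chunked
+    # transfer encoding); total_size then falls back to 0 and the tqdm
+    # bar runs without a known total.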
+    total_size = int(r.headers.get('content-length', 0))
+    block_size = 1024  # 1 KiB
+    t = tqdm(total=total_size, unit='iB', unit_scale=True)
+    with open(save_path, 'wb') as f:
+        for data in r.iter_content(block_size):
+            t.update(len(data))
+            f.write(data)
+    t.close()
+    if total_size != 0 and t.n != total_size:
+        print("ERROR, downloaded size does not match Content-Length")
+
+
+def download_assertions(cache_folder):
+    if not os.path.exists(cache_folder):
+        try:
+            os.makedirs(cache_folder)
+        except OSError:
+            print("Creation of the directory %s failed" % cache_folder)
+        else:
+            print("Successfully created the directory %s" % cache_folder)
+
+    assertion_gz_file = "%s/conceptnet-assertions-5.7.0.csv.gz" % cache_folder
+    if not os.path.exists(assertion_gz_file):
+        print("Downloading conceptnet-assertions-5.7.0.csv.gz")
+        url = "https://s3.amazonaws.com/conceptnet/downloads/2019/edges/conceptnet-assertions-5.7.0.csv.gz"
+        download(url, assertion_gz_file)
+
+    assertion_csv_file = "%s/assertions.csv" % cache_folder
+    if not os.path.exists(assertion_csv_file):
+        print('Extracting conceptnet-assertions-5.7.0.csv.gz')
+        with gzip.open(assertion_gz_file, 'rb') as f_in:
+            with open(assertion_csv_file, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+
+
+def extract_lang_assertions(language, cache_folder):
+    assertions_file = '%s/assertions.csv' % cache_folder
+    if not os.path.isfile(assertions_file):
+        download_assertions(cache_folder)
+
+    print("Extracting assertions for language: %s" % language)
+
+    lang_str = '/c/%s/' % language
+    assertions_lang_file = "%s/conceptnet-%s.csv" % (cache_folder, language)
+
+    with open(assertions_file) as af, open(assertions_lang_file, 'w') as cf:
+        ass_reader = csv.reader(af, delimiter='\t')
+        # Each row of assertions.csv is: edge URI, relation, start
+        # concept, end concept, JSON metadata.
+        for row in ass_reader:
+            r = row[1]
+            c1 = row[2]
+            c2 = row[3]
+
+            # Keep only edges whose endpoints are both in the requested language.
+            if c1.startswith(lang_str) and c2.startswith(lang_str):
+                cf.write('\t'.join([c1, r, c2]))
+                cf.write('\n')
+
+
+def generate_topn_ent_rel(output_folder, assertion_lang_file, topn_ents=20000):
+    '''
+    Step 4: Extract the top-n concepts that occur most frequently in the
+    language-specific assertions, and write entity2id.txt and relation2id.txt.
+    :param output_folder: folder in which the id files are written
+    :param assertion_lang_file: language-specific assertions file from step 3
+    :param topn_ents: number of most popular concepts to keep
+    :return: None
+    '''
+    n_ent_rel_dict_head = defaultdict(int)
+    n_ent_rel_dict_tail = defaultdict(int)
+    relations = set()
+
+    with open(assertion_lang_file) as rf:
+        for line in rf:
+            row = line.strip().split('\t')
+            relations.add(row[1])
+            n_ent_rel_dict_head[row[0]] += 1
+            n_ent_rel_dict_tail[row[-1]] += 1
+
+    head_ents = set(n_ent_rel_dict_head.keys())
+    tail_ents = set(n_ent_rel_dict_tail.keys())
+
+    entities = list(head_ents.union(tail_ents))
+
+    # A concept's popularity is the mean of its head and tail counts
+    # (a missing count defaults to 0 via defaultdict).
+    n_ent_avg_head_tail = {ent: mean([n_ent_rel_dict_head[ent], n_ent_rel_dict_tail[ent]])
+                           for ent in entities}
+
+    ordered_entities = sorted(n_ent_avg_head_tail.items(),
+                              key=lambda x: x[1],
+                              reverse=True)[:topn_ents]
+
+    entity2id_file = "%s/entity2id.txt" % output_folder
+    with open(entity2id_file, 'w') as ef:
+        # The first line holds the number of entries, as OpenKE expects.
+        ef.write('%d\n' % len(ordered_entities))
+        ef.write('\n'.join('\t'.join([v[0], str(i)]) for i, v in enumerate(ordered_entities)))
+
+    relation2id_file = "%s/relation2id.txt" % output_folder
+    relations = sorted(relations)
+    with open(relation2id_file, 'w') as rf:
+        rf.write('%d\n' % len(relations))
+        rf.write('\n'.join('\t'.join([v, str(i)]) for i, v in enumerate(relations)))
+
+
+def create_train2id(output_folder, assertion_lang_file):
+    '''
+    Step 5: Create the train2id.txt file from the assertions whose head
+    and tail both survived the top-n entity cut.
+    :param output_folder: folder containing entity2id.txt and relation2id.txt
+    :param assertion_lang_file: language-specific assertions file from step 3
+    :return: None
+    '''
+    entity2id_file = "%s/entity2id.txt" % output_folder
+    relation2id_file = "%s/relation2id.txt" % output_folder
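+    # Reload the id mappings written in step 4; the first line of each
+    # file holds the entry count, hence readlines()[1:] below.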
+    with open(entity2id_file) as ef, open(relation2id_file) as rf:
+        e_rows = [line.strip().split('\t') for line in ef.readlines()[1:]]
+        r_rows = [line.strip().split('\t') for line in rf.readlines()[1:]]
+        entity2id = {row[0]: row[1] for row in e_rows}
+        relation2id = {row[0]: row[1] for row in r_rows}
+
+    with open(assertion_lang_file) as af, open('%s/train2id.txt' % output_folder, 'w') as tf:
+        # Keep only the triples whose head and tail are both among the
+        # top-n entities.
+        training_rows = []
+        for line in af:
+            row = line.strip().split('\t')
+            if row[0] in entity2id and row[-1] in entity2id:
+                training_rows.append(row)
+
+        shuffle(training_rows)
+        tf.write('%d\n' % len(training_rows))
+        # OpenKE expects each training line as: head_id tail_id relation_id.
+        for row in training_rows:
+            tf.write(' '.join([entity2id[row[0]], entity2id[row[-1]], relation2id[row[1]]]))
+            tf.write('\n')
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 5:
+        print('Usage: python3 conceptnet2openke.py cache_folder_path language topn_ents output_folder')
+    else:
+        # folder in which the ConceptNet assertions file and intermediate files are stored
+        cache_folder = sys.argv[1]
+
+        # language of the concepts to be extracted
+        language = sys.argv[2]
+
+        # use only the top-n most popular concepts when generating the training set
+        topn_ents = int(sys.argv[3])
+
+        # folder in which the OpenKE-consumable files are stored
+        output_folder = sys.argv[4]
+
+        if not os.path.exists(output_folder):
+            try:
+                os.makedirs(output_folder)
+            except OSError:
+                print("Creation of the directory %s failed" % output_folder)
+            else:
+                print("Successfully created the directory %s" % output_folder)
+
+        assertion_lang_file = "%s/conceptnet-%s.csv" % (cache_folder, language)
+        if not os.path.isfile(assertion_lang_file):
+            print("Assertions file for the language not found!")
+            print("Creating the assertions file in the cache folder")
+            extract_lang_assertions(language, cache_folder)
+
+        print('Extracting the top-n most popular concepts')
+        generate_topn_ent_rel(output_folder, assertion_lang_file, topn_ents)
+
+        print('Creating train2id.txt file')
+        create_train2id(output_folder, assertion_lang_file)
diff --git a/scripts/conceptnet2openke/requirements.txt b/scripts/conceptnet2openke/requirements.txt
new file mode 100644
index 00000000..51ca25f7
--- /dev/null
+++ b/scripts/conceptnet2openke/requirements.txt
@@ -0,0 +1,6 @@
+certifi==2019.11.28
+chardet==3.0.4
+idna==2.9
+requests==2.23.0
+tqdm==4.43.0
+urllib3==1.25.8
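For reference, a minimal end-to-end run might look like this (the folder names `./cache` and `./openke_en` are illustrative, not fixed by the script):

```shell
# Download ConceptNet 5.7, extract the English assertions, and keep the
# 20000 most popular concepts.
python3 conceptnet2openke.py ./cache en 20000 ./openke_en

# The output folder then contains the three OpenKE input files:
#   entity2id.txt  relation2id.txt  train2id.txt
```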