From 5a26d52305d9f29ad337201c42d34ee5e9ed9f5b Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 23 May 2024 22:03:23 +0100 Subject: [PATCH 01/53] Initial species analysis, first ported loader. --- .gitignore | 1 + aggregates.py | 43 ---------------- digipres.github.io | 2 +- foreging/__init__.py | 1 + foreging/loc_fdd.py | 78 +++++++++++++++++++++++++++++ foreging/models.py | 20 ++++++++ foreging/species.py | 114 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 215 insertions(+), 44 deletions(-) create mode 100644 foreging/__init__.py create mode 100644 foreging/loc_fdd.py create mode 100644 foreging/models.py create mode 100644 foreging/species.py diff --git a/.gitignore b/.gitignore index edaacdde..c8f7fb13 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ /bin /pywikibot.lwp /passwordfile +*.pyc diff --git a/aggregates.py b/aggregates.py index b7ffd1fd..824f9dc7 100644 --- a/aggregates.py +++ b/aggregates.py @@ -87,50 +87,7 @@ def addFormat(rid,fid,finfo): fmts[rid]['formats'][fid] = finfo -def aggregateFDD(): - rid = "fdd" - print("Parsing %s..." % rid) - for filename in os.listdir('digipres.github.io/_sources/registries/fdd/fddXML'): - if filename.endswith(".xml"): - print(f"Parsing {filename}...") - # Get Identifier? - with open('digipres.github.io/_sources/registries/fdd/fddXML/'+filename, "rb") as f: - finfo = {} - finfo['source'] = filename - xml = f.read() - root = None - try: - #parser = etree.XMLParser() - #root = etree.parse(BytesIO(xml), parser) - root = BeautifulSoup(xml, "xml") - ffd_id = root.find('FDD').get('id') - finfo['name'] = root.find('FDD').get('titleName') - if root.find('magicNumbers'): - finfo['hasMagic'] = True - else: - finfo['hasMagic'] = False - # Get extensions: - extensions = list() - for fe in root.findAll('filenameExtension'): - for fev in fe.findAll('sigValue'): - extensions.append("*.%s" % fev.text) - finfo['extensions'] = extensions - # Get MIME types: - mimetypes = list() - for imts in root.findAll('internetMediaType'): - for mt in imts.findAll('sigValue'): - mimetypes.append(mt.text) - finfo['mimetypes'] = mimetypes - addFormat(rid,ffd_id,finfo) - except Exception as e: - print(f"Parsing {filename} failed: {e}") - if root: - print("XML parsed as:") - print(root.prettify()) - #print(etree.tostring(root, pretty_print=True).decode('utf-8')) - if rid in fmts: # FIXME this needs to be more robust, rather than relying on happening after 'addFormat' is called for the first time. - fmts[rid]['warnings'].append(f"Error when parsing XML from '{filename}': {e}") def aggregateTRiD(): rid = "trid" diff --git a/digipres.github.io b/digipres.github.io index f2235234..4141021d 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit f2235234b09053e41d611a7dcbc2c3268ecd3574 +Subproject commit 4141021d3e545bce081927e6bec37d253f8a4cd4 diff --git a/foreging/__init__.py b/foreging/__init__.py new file mode 100644 index 00000000..4c04cbc7 --- /dev/null +++ b/foreging/__init__.py @@ -0,0 +1 @@ +# FOrmat REGistry INdexinG \ No newline at end of file diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py new file mode 100644 index 00000000..b3e3b7f9 --- /dev/null +++ b/foreging/loc_fdd.py @@ -0,0 +1,78 @@ +import os +import logging +from bs4 import BeautifulSoup +from models import Format + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class LocFDD(): + registry_id = "loc_fdd" + source_folder = 'digipres.github.io/_sources/registries/fdd/fddXML' + warnings = [] + show_parsed_xml_on_errors = False + + def get_formats(self): + logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) + + for filename in os.listdir(self.source_folder): + if filename.endswith(".xml"): + logger.info(f"Parsing {filename}...") + with open(f"{self.source_folder}/{filename}", "rb") as f: + xml = f.read() + root = None + try: + # Alternative code that was more difficult to work with: + #parser = etree.XMLParser() + #root = etree.parse(BytesIO(xml), parser) + root = BeautifulSoup(xml, "xml") + ffd_id = root.find('FDD').get('id') + f_name = root.find('FDD').get('titleName') + if root.find('magicNumbers'): + f_magic = True + else: + f_magic = False + # Get extensions: + extensions = list() + for fe in root.findAll('filenameExtension'): + for fev in fe.findAll('sigValue'): + extensions.append("%s" % fev.text) + f_extensions = extensions + # Get MIME types: + mimetypes = list() + for imts in root.findAll('internetMediaType'): + for mt in imts.findAll('sigValue'): + mimetypes.append(mt.text) + f_mimetypes = mimetypes + # Create record: + f = Format( + registry_id=self.registry_id, + id=ffd_id, + name=f_name, + summary=root.find("shortDescription").text, + extensions=f_extensions, + media_types=f_mimetypes, + has_magic=f_magic, + primary_media_type=None, + parent_media_type=None, + registry_url=f"https://www.loc.gov/preservation/digital/formats/fdd/{ffd_id}.shtml", + registry_source_data_url=f"https://www.loc.gov/preservation/digital/formats/fdd/{ffd_id}.xml", + registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/fdd/fddXML/{ffd_id}.xml", + additional_fields= None, + last_modified=root.findAll('date')[-1].text, + ) + yield f + except Exception as e: + logger.error(f"Parsing {filename} failed: {e}") + self.warnings.append(f"Error when parsing XML from '{filename}': {e}") + # Emit extra debug info if possible: + if root and self.show_parsed_xml_on_errors: + logger.error("XML parsed as:") + logger.error(root.prettify()) + #print(etree.tostring(root, pretty_print=True).decode('utf-8')) + + +if __name__ == "__main__": + gen = LocFDD() + for f in gen.get_formats(): + print(f.model_dump_json()) \ No newline at end of file diff --git a/foreging/models.py b/foreging/models.py new file mode 100644 index 00000000..b5f5c34b --- /dev/null +++ b/foreging/models.py @@ -0,0 +1,20 @@ +from typing import List, Optional, Set, Dict +from pydantic import BaseModel, AnyHttpUrl, PastDate + +# Pydantic data model for partially normalised/star-schema format registry entries: +class Format(BaseModel): + registry_id: str + id: str + name: str + summary: str + extensions: List[str] = [] + media_types: List[str] = [] + has_magic: bool + primary_media_type: Optional[str] + parent_media_type: Optional[str] + registry_url: AnyHttpUrl + registry_source_data_url: AnyHttpUrl + created: Optional[PastDate] = None + last_modified: Optional[PastDate] = None + # A spot of any additional fields: + additional_fields: Optional[Dict[str,str]] \ No newline at end of file diff --git a/foreging/species.py b/foreging/species.py new file mode 100644 index 00000000..1277e688 --- /dev/null +++ b/foreging/species.py @@ -0,0 +1,114 @@ +# Use the idea of a Species Accumulation Curve to understand the scale of the format challenge. +import csv +import yaml +import json +import logging +import argparse +from collections import defaultdict + +logging.basicConfig(level=logging.WARNING, format='%(asctime)s: %(levelname)s - %(name)s - %(message)s') + +logger = logging.getLogger(__name__) + + +def load_extensions(): + with open('digipres.github.io/_data/formats/extensions.yml') as f: + extensions = yaml.safe_load(f) + return extensions + +def reindex_by_registry(extensions): + exts = extensions['extensions'] + ext_sets = defaultdict(set) + for ext in exts: + for id in exts[ext]['identifiers']: + ext_sets[id['regId']].add(ext.lower()) + return ext_sets + +def compute_sac(): + ext_sets = reindex_by_registry(load_extensions()) + + all_extensions = set() + sample_total = 0 + + # Go though the dict of sets, sorting them so largest sets go first (note each item is the k,v array): + # Doing this seems to make the curve fitting more robust/consistent. + print("source,num_exts,num_uniq_exts,percent_uniq_exts,total_exts,total_uniq_exts,added_uniq_exts") + for set_key, ext_set in sorted(ext_sets.items(), key=lambda item: len(item[1]), reverse=True): + sample_total += len(ext_set) + current_total = len(all_extensions) + all_extensions |= ext_set + total_added = len(all_extensions) - current_total + # Calculate the unique part, by making a copy of the set and removing all other sets from it: + unique_ext = ext_set.copy() + for other_set in ext_sets: + if other_set != set_key: + unique_ext -= ext_sets[other_set] + # Share & Enjoy: + set_size = len(ext_set) + unique_size = len(unique_ext) + print(f"{set_key},{set_size},{unique_size},{100.0*unique_size/set_size:.3f},{sample_total},{len(all_extensions)},{total_added}") + + +def _print_comparison(set_key, candidate_set, collection_set, collection_counts, collection_total): + remainder = collection_set - candidate_set + common = collection_set.intersection(candidate_set) + remainder_count = 0 + for ext in remainder: + remainder_count += collection_counts[ext] + print(f"{set_key} {len(common)} {len(remainder)} {remainder_count} {collection_total}")# {json.dumps(list(remainder))}") + +def compare_csv(csv_file): + collection_set = set() + collection_counts = {} + collection_total = 0 + with open(csv_file) as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + ext = row['extension'].lower().strip() + # Drop extensions with spaces in: + if " " in ext: + logger.warning(f"Dropping extension with space in: '{ext}'") + continue + # Drop extensions that are just numbers: + if ext.isnumeric(): + logger.warning(f"Dropping extension that appears to be just a number: '{ext}'") + continue + # Convert to standard lower-case glob format + ext = f"*.{ext}" + logger.debug(f"Found extension {ext} with file_count {row['file_count']}") + collection_set.add(ext) + collection_counts[ext] = int(row['file_count']) + collection_total += int(row['file_count']) + + ext_sets = reindex_by_registry(load_extensions()) + all_extensions = set() + for set_key, ext_set in sorted(ext_sets.items(), key=lambda item: len(item[1]), reverse=True): + all_extensions |= ext_set + _print_comparison(set_key, ext_set, collection_set, collection_counts, collection_total) + _print_comparison("_ALL_", all_extensions, collection_set, collection_counts, collection_total) + + +if __name__ == "__main__": + common_args = argparse.ArgumentParser(prog="species", add_help=False) + common_args.add_argument('-v', '--verbose', action='count', default=0, help='Logging level; add more -v for more logging.') + + parser = argparse.ArgumentParser(prog="species", add_help=True) + subparsers = parser.add_subparsers(dest="action", help='action') + + parser_sac = subparsers.add_parser('curve', parents=[common_args], help="Load the extensions and compute the Species Accumulation Curve.") + + parser_cmp = subparsers.add_parser('compare', parents=[common_args], help="Compare extensions from a CSV file with the registry contents.") + parser_cmp.add_argument('csv_file', type=str, help='CSV file to load') + + args = parser.parse_args() + + # Set up verbose logging: + if args.verbose == 1: + logging.getLogger().setLevel(logging.INFO) + elif args.verbose >= 2: + logging.getLogger().setLevel(logging.DEBUG) + + if args.action == 'curve': + compute_sac() + elif args.action == 'compare': + compare_csv(args.csv_file) \ No newline at end of file From c77ef6671cac345139d9ec164c2c164c7650980a Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 20 Jun 2024 11:15:28 +0100 Subject: [PATCH 02/53] Initial new-style PRONOM parser. --- README.md | 8 ++++ aggregates.py | 8 +++- digipres.github.io | 2 +- foreging/loc_fdd.py | 2 +- foreging/models.py | 1 + foreging/pronom.py | 91 +++++++++++++++++++++++++++++++++++++++++++++ foreging/species.py | 12 +++++- 7 files changed, 119 insertions(+), 5 deletions(-) create mode 100644 foreging/pronom.py diff --git a/README.md b/README.md index dce7756f..af780bf4 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,14 @@ Sentinel This is the watcher that watches the watched and reports the reports to +``` +$ python -m foreging.pronom 2>&1 > pronom.jsonl +$ python -m foreging.loc_fdd 2>&1 > loc.jsonl +$ sqlite-utils insert registries.db format --nl loc.jsonl +$ sqlite-utils insert registries.db format --nl pronom.jsonl +$ sqlite-utils enable-fts registries.db format name summary extensions media_types +``` + How it works ------------ diff --git a/aggregates.py b/aggregates.py index 824f9dc7..ee28feb8 100644 --- a/aggregates.py +++ b/aggregates.py @@ -351,9 +351,9 @@ def aggregateWikiData(): with open("%s/extensions.yml" % data_dir, 'w') as outfile: outfile.write( yaml.safe_dump(extensions, default_flow_style=False) ) -# Write out Venn data +# Write out Venn data, starting from a list like [extension] -> Registry_ID: print("Outputting Venn data based on extensions...") -# Key all the RID-to-integer mappings: +# Key all the Registry_ID-to-integer mappings: vennls = {} i = 0 for fmt in fmts: @@ -364,15 +364,19 @@ def aggregateWikiData(): venndsl = defaultdict(list) vennlt = defaultdict(int) vennids = {} +# Loop over all extensions: for extension in exts: regs = set() regIds = set() + # Loop over each registry the extension appears in: for ridder in exts[extension]['identifiers']: regs.add(vennls[ridder['regId']]) regIds.add(ridder['regId']) for rid in regs: vennlt[rid] += 1 + # Build a unique key for each registry combination: key = ','.join(sorted(regs)) + # Use the key to build up each overlap set: vennids[key] = sorted(regIds) venndsl[key].append(extension) vennds[key] += 1 diff --git a/digipres.github.io b/digipres.github.io index 4141021d..648c406f 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit 4141021d3e545bce081927e6bec37d253f8a4cd4 +Subproject commit 648c406f037c81abf9b4844cb4aad823609de0b5 diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py index b3e3b7f9..65b77c63 100644 --- a/foreging/loc_fdd.py +++ b/foreging/loc_fdd.py @@ -1,7 +1,7 @@ import os import logging from bs4 import BeautifulSoup -from models import Format +from .models import Format logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/foreging/models.py b/foreging/models.py index b5f5c34b..bee2960d 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -14,6 +14,7 @@ class Format(BaseModel): parent_media_type: Optional[str] registry_url: AnyHttpUrl registry_source_data_url: AnyHttpUrl + registry_index_data_url: Optional[AnyHttpUrl] created: Optional[PastDate] = None last_modified: Optional[PastDate] = None # A spot of any additional fields: diff --git a/foreging/pronom.py b/foreging/pronom.py new file mode 100644 index 00000000..6924f5bc --- /dev/null +++ b/foreging/pronom.py @@ -0,0 +1,91 @@ +import os +import logging +import datetime +from bs4 import BeautifulSoup +from .models import Format + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +class PRONOM(): + registry_id = "pronom" + source_folder = 'digipres.github.io/_sources/registries/pronom/' + warnings = [] + show_parsed_xml_on_errors = False + + def _date_parser(self, pronom_date): + # PRONOM uses '11 Apr 2024' format so this needs parsing here: + date = datetime.datetime.strptime(pronom_date, "%d %b %Y") + return date + + def get_formats(self): + logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) + + for source_folder_name in ['fmt', 'x-fmt']: + source_folder = os.path.join(self.source_folder, source_folder_name) + + for filename in os.listdir(source_folder): + if filename.endswith(".xml"): + logger.info(f"Parsing {filename}...") + with open(f"{source_folder}/{filename}", "rb") as f: + xml = f.read() + root = None + try: + # Alternative code that was more difficult to work with: + #parser = etree.XMLParser() + #root = etree.parse(BytesIO(xml), parser) + root = BeautifulSoup(xml, "xml") + ffd_id = f"{source_folder_name}/{filename[0:-4]}" + # To Do check FileFormatIdentifier.Identifier matches for FileFormatIdentifier.Identifier.Type == 'PUID' + f_name = root.find('FormatName').text + if root.find('InternalSignature'): + f_magic = True + else: + f_magic = False + # Get extensions: + extensions = list() + for fe in root.findAll('ExternalSignature'): + if fe.find('SignatureType', string='File extension'): + extensions.append(fe.find('Signature').text) + f_extensions = extensions + # Get MIME types: + mimetypes = list() + for ffi in root.findAll('FileFormatIdentifier'): + if ffi.find('IdentifierType', string='MIME'): + mimetypes.append(ffi.find('Identifier').text) + f_mimetypes = mimetypes + # Create record: + f = Format( + registry_id=self.registry_id, + id=ffd_id, + name=f_name, + summary=root.find("FormatDescription").text, + extensions=f_extensions, + media_types=f_mimetypes, + has_magic=f_magic, + primary_media_type=None, + parent_media_type=None, + registry_url=f"https://www.nationalarchives.gov.uk/pronom/{ffd_id}", + registry_source_data_url=f"https://www.nationalarchives.gov.uk/pronom/{ffd_id}.xml", + registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/pronom/{ffd_id}.xml", + additional_fields= None, + created=self._date_parser(root.find('ProvenanceSourceDate').text), + last_modified=self._date_parser(root.find('LastUpdatedDate').text), + ) + yield f + except Exception as e: + logger.exception(f"Parsing {filename} failed", e) + self.warnings.append(f"Error when parsing XML from '{filename}': {e}") + # Emit extra debug info if possible: + if root and self.show_parsed_xml_on_errors: + logger.error("XML parsed as:") + logger.error(root.prettify()) + #print(etree.tostring(root, pretty_print=True).decode('utf-8')) + break + + +if __name__ == "__main__": + gen = PRONOM() + gen.show_parsed_xml_on_errors = True + for f in gen.get_formats(): + print(f.model_dump_json()) \ No newline at end of file diff --git a/foreging/species.py b/foreging/species.py index 1277e688..95ec05f0 100644 --- a/foreging/species.py +++ b/foreging/species.py @@ -87,6 +87,11 @@ def compare_csv(csv_file): _print_comparison(set_key, ext_set, collection_set, collection_counts, collection_total) _print_comparison("_ALL_", all_extensions, collection_set, collection_counts, collection_total) +def write_extensions(output_json): + ext_sets = reindex_by_registry(load_extensions()) + with open(output_json,"w") as f: + json.dump(ext_sets, f, default=list) + if __name__ == "__main__": common_args = argparse.ArgumentParser(prog="species", add_help=False) @@ -100,6 +105,9 @@ def compare_csv(csv_file): parser_cmp = subparsers.add_parser('compare', parents=[common_args], help="Compare extensions from a CSV file with the registry contents.") parser_cmp.add_argument('csv_file', type=str, help='CSV file to load') + parser_exts = subparsers.add_parser('extensions', parents=[common_args], help="Write the extensions data out as a JSON file.") + parser_exts.add_argument('json_file', type=str, help='JSON file to write') + args = parser.parse_args() # Set up verbose logging: @@ -111,4 +119,6 @@ def compare_csv(csv_file): if args.action == 'curve': compute_sac() elif args.action == 'compare': - compare_csv(args.csv_file) \ No newline at end of file + compare_csv(args.csv_file) + elif args.action == "extensions": + write_extensions(args.json_file) \ No newline at end of file From 2ddf18ea24c7f521c9d8beef994a0e530fbc1d7c Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 20 Jun 2024 14:01:26 +0100 Subject: [PATCH 03/53] Adding DVC setup. --- .dvc/.gitignore | 3 +++ .dvc/config | 0 .dvcignore | 3 +++ dvc.yaml | 20 ++++++++++++++++++++ 4 files changed, 26 insertions(+) create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore create mode 100644 dvc.yaml diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 00000000..528f30c7 --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 00000000..e69de29b diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 00000000..51973055 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 00000000..62a6ee92 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,20 @@ +stages: + transform-format-data: + cmd: + - python -m foreging.pronom > pronom.jsonl + - python -m foreging.loc_fdd > loc.jsonl + deps: + - digipres.github.io/_sources/registries + outs: + - loc.jsonl + - pronom.jsonl + generate-sqlite: + cmd: + - sqlite-utils insert registries.db format --nl loc.jsonl + - sqlite-utils insert registries.db format --nl pronom.jsonl + - sqlite-utils enable-fts registries.db format name summary extensions media_types genres + deps: + - loc.jsonl + - pronom.jsonl + outs: + - registries.db \ No newline at end of file From 37ac28b1a9ac804ff8c81b17fa82cfa10bc42ca7 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 20 Jun 2024 14:23:55 +0100 Subject: [PATCH 04/53] Working dvc repro, added genres. --- .gitignore | 3 +++ dvc.lock | 46 +++++++++++++++++++++++++++++++++++++++++++++ dvc.yaml | 9 +++++---- foreging/loc_fdd.py | 14 ++++++++++++-- foreging/models.py | 3 ++- foreging/pronom.py | 9 +++++++-- 6 files changed, 75 insertions(+), 9 deletions(-) create mode 100644 dvc.lock diff --git a/.gitignore b/.gitignore index c8f7fb13..47adea1d 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ /pywikibot.lwp /passwordfile *.pyc +/registries.db +/loc.jsonl +/pronom.jsonl diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 00000000..dda35098 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,46 @@ +schema: '2.0' +stages: + transform-format-data: + cmd: + - python -m foreging.pronom > pronom.jsonl + - python -m foreging.loc_fdd > loc.jsonl + deps: + - path: digipres.github.io/_sources/registries + hash: md5 + md5: 6f3d9b2b5ba1f6fdc7a123e86579852d.dir + size: 211853546 + nfiles: 26983 + - path: foreging + hash: md5 + md5: a42a3fc4caca882d4e69d4d060a285b6.dir + size: 22005 + nfiles: 9 + outs: + - path: loc.jsonl + hash: md5 + md5: 380914567a5637ad1a6c075403b1db89 + size: 503541 + - path: pronom.jsonl + hash: md5 + md5: f000bb466510df9f0f8a8378fe71543f + size: 2368966 + generate-sqlite: + cmd: + - sqlite-utils insert registries.db formats --nl loc.jsonl + - sqlite-utils insert registries.db formats --nl pronom.jsonl + - sqlite-utils enable-fts registries.db formats name summary extensions iana_media_types + genres + deps: + - path: loc.jsonl + hash: md5 + md5: 380914567a5637ad1a6c075403b1db89 + size: 503541 + - path: pronom.jsonl + hash: md5 + md5: f000bb466510df9f0f8a8378fe71543f + size: 2368966 + outs: + - path: registries.db + hash: md5 + md5: cdb4dfcf2d6d6eb0bef2029ef5bef17a + size: 3174400 diff --git a/dvc.yaml b/dvc.yaml index 62a6ee92..d4a59e80 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -4,15 +4,16 @@ stages: - python -m foreging.pronom > pronom.jsonl - python -m foreging.loc_fdd > loc.jsonl deps: - - digipres.github.io/_sources/registries + - digipres.github.io/_sources/registries # source data + - foreging # source code outs: - loc.jsonl - pronom.jsonl generate-sqlite: cmd: - - sqlite-utils insert registries.db format --nl loc.jsonl - - sqlite-utils insert registries.db format --nl pronom.jsonl - - sqlite-utils enable-fts registries.db format name summary extensions media_types genres + - sqlite-utils insert registries.db formats --nl loc.jsonl + - sqlite-utils insert registries.db formats --nl pronom.jsonl + - sqlite-utils enable-fts registries.db formats name summary extensions iana_media_types genres deps: - loc.jsonl - pronom.jsonl diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py index 65b77c63..6cb33c40 100644 --- a/foreging/loc_fdd.py +++ b/foreging/loc_fdd.py @@ -28,6 +28,12 @@ def get_formats(self): root = BeautifulSoup(xml, "xml") ffd_id = root.find('FDD').get('id') f_name = root.find('FDD').get('titleName') + # Genre: + f_genres = list() + for gns in root.findAll('gdfrGenreSelection'): + for gn in gns.findAll('gdfrGenre'): + f_genres.append(f"gdfr:{gn.text}") + # Haz Magic? if root.find('magicNumbers'): f_magic = True else: @@ -44,14 +50,17 @@ def get_formats(self): for mt in imts.findAll('sigValue'): mimetypes.append(mt.text) f_mimetypes = mimetypes + # Find the date: + edit_date = root.findAll('date')[-1].text # Create record: f = Format( registry_id=self.registry_id, id=ffd_id, name=f_name, summary=root.find("shortDescription").text, + genres=f_genres, extensions=f_extensions, - media_types=f_mimetypes, + iana_media_types=f_mimetypes, has_magic=f_magic, primary_media_type=None, parent_media_type=None, @@ -59,7 +68,8 @@ def get_formats(self): registry_source_data_url=f"https://www.loc.gov/preservation/digital/formats/fdd/{ffd_id}.xml", registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/fdd/fddXML/{ffd_id}.xml", additional_fields= None, - last_modified=root.findAll('date')[-1].text, + created=edit_date, + last_modified=edit_date, ) yield f except Exception as e: diff --git a/foreging/models.py b/foreging/models.py index bee2960d..7befb20d 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -7,8 +7,9 @@ class Format(BaseModel): id: str name: str summary: str + genres: List[str] = [] extensions: List[str] = [] - media_types: List[str] = [] + iana_media_types: List[str] = [] has_magic: bool primary_media_type: Optional[str] parent_media_type: Optional[str] diff --git a/foreging/pronom.py b/foreging/pronom.py index 6924f5bc..9d5723ba 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -36,8 +36,12 @@ def get_formats(self): #root = etree.parse(BytesIO(xml), parser) root = BeautifulSoup(xml, "xml") ffd_id = f"{source_folder_name}/{filename[0:-4]}" - # To Do check FileFormatIdentifier.Identifier matches for FileFormatIdentifier.Identifier.Type == 'PUID' f_name = root.find('FormatName').text + # Genres: + f_types = root.find('FormatTypes').text.strip().split(',') + if( len(f_types) == 0 ): + f_types = [""] + # Internal signatures: if root.find('InternalSignature'): f_magic = True else: @@ -60,8 +64,9 @@ def get_formats(self): id=ffd_id, name=f_name, summary=root.find("FormatDescription").text, + genres=f_types, extensions=f_extensions, - media_types=f_mimetypes, + iana_media_types=f_mimetypes, has_magic=f_magic, primary_media_type=None, parent_media_type=None, From 643672ccd2098b3482d9f2091a2a30d786962615 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 20 Jun 2024 14:53:59 +0100 Subject: [PATCH 05/53] Clean up PRONOM genres. --- README.md | 8 -------- dvc.lock | 16 ++++++++-------- dvc.yaml | 2 +- foreging/pronom.py | 4 ++++ 4 files changed, 13 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index af780bf4..dce7756f 100644 --- a/README.md +++ b/README.md @@ -4,14 +4,6 @@ Sentinel This is the watcher that watches the watched and reports the reports to -``` -$ python -m foreging.pronom 2>&1 > pronom.jsonl -$ python -m foreging.loc_fdd 2>&1 > loc.jsonl -$ sqlite-utils insert registries.db format --nl loc.jsonl -$ sqlite-utils insert registries.db format --nl pronom.jsonl -$ sqlite-utils enable-fts registries.db format name summary extensions media_types -``` - How it works ------------ diff --git a/dvc.lock b/dvc.lock index dda35098..694c3e9f 100644 --- a/dvc.lock +++ b/dvc.lock @@ -12,8 +12,8 @@ stages: nfiles: 26983 - path: foreging hash: md5 - md5: a42a3fc4caca882d4e69d4d060a285b6.dir - size: 22005 + md5: 3d4810f7fa7c96d09297bf43a15acd0d.dir + size: 22575 nfiles: 9 outs: - path: loc.jsonl @@ -22,8 +22,8 @@ stages: size: 503541 - path: pronom.jsonl hash: md5 - md5: f000bb466510df9f0f8a8378fe71543f - size: 2368966 + md5: ee8fe400d5fbf6b405b39e77e7b713b8 + size: 2372708 generate-sqlite: cmd: - sqlite-utils insert registries.db formats --nl loc.jsonl @@ -37,10 +37,10 @@ stages: size: 503541 - path: pronom.jsonl hash: md5 - md5: f000bb466510df9f0f8a8378fe71543f - size: 2368966 + md5: ee8fe400d5fbf6b405b39e77e7b713b8 + size: 2372708 outs: - path: registries.db hash: md5 - md5: cdb4dfcf2d6d6eb0bef2029ef5bef17a - size: 3174400 + md5: 479eb87b8af1d3208b57f59137f6719b + size: 3182592 diff --git a/dvc.yaml b/dvc.yaml index d4a59e80..fadaccbf 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -6,7 +6,7 @@ stages: deps: - digipres.github.io/_sources/registries # source data - foreging # source code - outs: + outs: # Note that 'outs' means DVC will wipe them before running: - loc.jsonl - pronom.jsonl generate-sqlite: diff --git a/foreging/pronom.py b/foreging/pronom.py index 9d5723ba..9e01f8ef 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -41,6 +41,10 @@ def get_formats(self): f_types = root.find('FormatTypes').text.strip().split(',') if( len(f_types) == 0 ): f_types = [""] + # Strip whitespace from genres: + f_types = [g.strip() for g in f_types] + # Replace empty strings with "Undefined" + f_types = ['undefined' if not g else g for g in f_types] # Internal signatures: if root.find('InternalSignature'): f_magic = True From 355ef824cc62a5737e741db8313538f01f0974ee Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 11 Jul 2024 11:26:59 +0100 Subject: [PATCH 06/53] Extend the new model, add NARA grabber. --- digipres.github.io | 2 +- foreging/models.py | 2 +- foreging/nara.py | 83 +++++++++++++++++++++++++++++++++++++++++++++ foreging/species.py | 9 ++--- setup.sh | 1 + 5 files changed, 91 insertions(+), 6 deletions(-) create mode 100644 foreging/nara.py diff --git a/digipres.github.io b/digipres.github.io index 648c406f..35b8cd80 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit 648c406f037c81abf9b4844cb4aad823609de0b5 +Subproject commit 35b8cd809ed4dd4819cca09fb15356c20a55521f diff --git a/foreging/models.py b/foreging/models.py index 7befb20d..7690645f 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -19,4 +19,4 @@ class Format(BaseModel): created: Optional[PastDate] = None last_modified: Optional[PastDate] = None # A spot of any additional fields: - additional_fields: Optional[Dict[str,str]] \ No newline at end of file + additional_fields: Optional[Dict[str,List[str]]] \ No newline at end of file diff --git a/foreging/nara.py b/foreging/nara.py new file mode 100644 index 00000000..6bd5a682 --- /dev/null +++ b/foreging/nara.py @@ -0,0 +1,83 @@ +import os +import logging +from rdflib import Graph, RDF, DCTERMS +from rdflib.namespace import DefinedNamespace, Namespace +from rdflib.term import URIRef +from .models import Format + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# +# Define RDF entities needed to work with this source: +# +class NARA(DefinedNamespace): + FileFormat: URIRef # File Format + category: URIRef + formatName: URIRef + preservationAction: URIRef + preservationPlan: URIRef + riskLevel: URIRef + tools: URIRef + + _NS = Namespace("https://www.archives.gov/data/lod/dpframework/def/") + +class WDT(DefinedNamespace): + p1163: URIRef # Media Type + p1195: URIRef # File Extension + p2748: URIRef # PRONOM link + p3381: URIRef # File Formats Wiki link + p973: URIRef # Wikipedia link + + _NS = Namespace("http://www.wikidata.org/entity/") + + + +# +# NARA File Format Preservation Plan parser +# +class NARA_FFPP(): + registry_id = "nara_ffpp" + source_file = 'digipres.github.io/_sources/registries/nara/fileformats.ttl' + warnings = [] + + def get_formats(self): + logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) + + g = Graph() + g.parse(self.source_file) + + + for s, p, o in g.triples((None, RDF.type, NARA.FileFormat)): + ff_id = g.value(s, DCTERMS.identifier) + additionals = {} + for p in [ NARA.preservationAction, NARA.preservationPlan, NARA.tools, WDT.p2748, WDT.p3381, WDT.p973]: + value = g.value(s, p) + if value: + additionals[p] = [o for s, p, o in g.triples((s, p, None))] + # Set up as a format entity: + f = Format( + registry_id=self.registry_id, + id=ff_id, + name=g.value(s, NARA.formatName), + summary=g.value(s, DCTERMS.description), + genres=[o for s, p, o in g.triples((s, NARA.category, None))], + extensions=[o for s, p, o in g.triples((s, WDT.p1195, None))], + iana_media_types=[o for s, p, o in g.triples((s, WDT.p1163, None))], + has_magic=False, + primary_media_type=None, + parent_media_type=None, + registry_url=f"https://www.archives.gov/preservation/digital-preservation/linked-data#{ff_id}", + registry_source_data_url=f"https://www.archives.gov/files/lod/dpframework/id/{ff_id}.ttl", + registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/nara/fileformats.ttl#{ff_id}", + additional_fields=additionals, + created=None, + last_modified=None, + ) + yield f + + +if __name__ == "__main__": + gen = NARA_FFPP() + for f in gen.get_formats(): + print(f.model_dump_json()) \ No newline at end of file diff --git a/foreging/species.py b/foreging/species.py index 95ec05f0..2baf7c7b 100644 --- a/foreging/species.py +++ b/foreging/species.py @@ -111,10 +111,11 @@ def write_extensions(output_json): args = parser.parse_args() # Set up verbose logging: - if args.verbose == 1: - logging.getLogger().setLevel(logging.INFO) - elif args.verbose >= 2: - logging.getLogger().setLevel(logging.DEBUG) + if 'verbose' in args: + if args.verbose == 1: + logging.getLogger().setLevel(logging.INFO) + elif args.verbose >= 2: + logging.getLogger().setLevel(logging.DEBUG) if args.action == 'curve': compute_sac() diff --git a/setup.sh b/setup.sh index 70e71254..cc5a5968 100755 --- a/setup.sh +++ b/setup.sh @@ -20,6 +20,7 @@ pip install requests pip install pyyaml pip install beautifulsoup4 pip install lxml +pip install rdflib # Running... echo "And login..." From 61b361050ce0a3ff187ec1dc5dcc59c785e8983a Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 11 Jul 2024 11:29:39 +0100 Subject: [PATCH 07/53] Add NARA to DVC workflow. --- .gitignore | 1 + dvc.lock | 34 ++++++++++++++++++++++------------ dvc.yaml | 4 ++++ 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 47adea1d..bae1a9a0 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ /registries.db /loc.jsonl /pronom.jsonl +/nara.jsonl diff --git a/dvc.lock b/dvc.lock index 694c3e9f..d604b008 100644 --- a/dvc.lock +++ b/dvc.lock @@ -4,30 +4,36 @@ stages: cmd: - python -m foreging.pronom > pronom.jsonl - python -m foreging.loc_fdd > loc.jsonl + - python -m foreging.nara > nara.jsonl deps: - path: digipres.github.io/_sources/registries hash: md5 - md5: 6f3d9b2b5ba1f6fdc7a123e86579852d.dir - size: 211853546 - nfiles: 26983 + md5: 19b255a97dce0ce49a396d4c7ddcdaa8.dir + size: 216499227 + nfiles: 27072 - path: foreging hash: md5 - md5: 3d4810f7fa7c96d09297bf43a15acd0d.dir - size: 22575 - nfiles: 9 + md5: 85ed7928fc1ba6db8855fa23adbe8e4a.dir + size: 32957 + nfiles: 12 outs: - path: loc.jsonl hash: md5 md5: 380914567a5637ad1a6c075403b1db89 size: 503541 + - path: nara.jsonl + hash: md5 + md5: 9e00363e4dfdd006d597aeb9bf7cdeab + size: 1257013 - path: pronom.jsonl hash: md5 - md5: ee8fe400d5fbf6b405b39e77e7b713b8 - size: 2372708 + md5: a1e49e58d2be69a27ad2e5f75203e37a + size: 2376855 generate-sqlite: cmd: - sqlite-utils insert registries.db formats --nl loc.jsonl - sqlite-utils insert registries.db formats --nl pronom.jsonl + - sqlite-utils insert registries.db formats --nl nara.jsonl - sqlite-utils enable-fts registries.db formats name summary extensions iana_media_types genres deps: @@ -35,12 +41,16 @@ stages: hash: md5 md5: 380914567a5637ad1a6c075403b1db89 size: 503541 + - path: nara.jsonl + hash: md5 + md5: 9e00363e4dfdd006d597aeb9bf7cdeab + size: 1257013 - path: pronom.jsonl hash: md5 - md5: ee8fe400d5fbf6b405b39e77e7b713b8 - size: 2372708 + md5: a1e49e58d2be69a27ad2e5f75203e37a + size: 2376855 outs: - path: registries.db hash: md5 - md5: 479eb87b8af1d3208b57f59137f6719b - size: 3182592 + md5: 81d65d6f0e722015f8cbf8c8828b069f + size: 4759552 diff --git a/dvc.yaml b/dvc.yaml index fadaccbf..269bf1b3 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -3,19 +3,23 @@ stages: cmd: - python -m foreging.pronom > pronom.jsonl - python -m foreging.loc_fdd > loc.jsonl + - python -m foreging.nara > nara.jsonl deps: - digipres.github.io/_sources/registries # source data - foreging # source code outs: # Note that 'outs' means DVC will wipe them before running: - loc.jsonl - pronom.jsonl + - nara.jsonl generate-sqlite: cmd: - sqlite-utils insert registries.db formats --nl loc.jsonl - sqlite-utils insert registries.db formats --nl pronom.jsonl + - sqlite-utils insert registries.db formats --nl nara.jsonl - sqlite-utils enable-fts registries.db formats name summary extensions iana_media_types genres deps: - loc.jsonl + - nara.jsonl - pronom.jsonl outs: - registries.db \ No newline at end of file From 694d4df9659ee056de10497c430ece700aed5f48 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Tue, 10 Sep 2024 11:37:09 +0100 Subject: [PATCH 08/53] Added initial TCDB parser. --- foreging/tcdb.py | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 foreging/tcdb.py diff --git a/foreging/tcdb.py b/foreging/tcdb.py new file mode 100644 index 00000000..b4315b90 --- /dev/null +++ b/foreging/tcdb.py @@ -0,0 +1,55 @@ +import os +import csv +import logging +from .models import Format + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# +# +# TCDB CSV dump parser +# +class TCDB(): + registry_id = "tcdb" + source_file = 'digipres.github.io/_sources/registries/tcdb/TCDB_2003.8_data-cleaned.csv' + warnings = [] + + def get_formats(self): + logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) + + # Open, coping with Unicode BOM + with open(self.source_file, "r", encoding='utf-8-sig') as csv_file: + reader = csv.DictReader(csv_file) + for row in reader: + additionals = { + 'mac-type': [row['Type']], + 'mac-creator': [row['Creator']] + } + # Set up as a format entity: + f = Format( + registry_id=self.registry_id, + id=f"tcdb:{row['Type']}:{row['Creator']}", + name=row['File Name'], + version=None, + summary=f"Type: {row['Type']}, Creator: {row['Creator']}, Comments: {row['Comments']}", + genres=[row['Category']], + extensions=[row['Extension']], + iana_media_types=[], + has_magic=False, + primary_media_type=None, + parent_media_type=None, + registry_url=f"https://github.com/thorsted/Born-Digital-Scripts/tree/main/TC%20Identification", + registry_source_data_url=f"https://github.com/thorsted/Born-Digital-Scripts/blob/main/TC%20Identification/TCDB_2003.8_data-cleaned.csv", + registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/tcdb/TCDB_2003.8_data-cleaned.csv", + additional_fields=additionals, + created=None, + last_modified=None, + ) + yield f + + +if __name__ == "__main__": + gen = TCDB() + for f in gen.get_formats(): + print(f.model_dump_json()) \ No newline at end of file From 8f9c5d5097e4ca71d17ed6d51397fce3969b1a82 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Tue, 10 Sep 2024 11:37:32 +0100 Subject: [PATCH 09/53] Added in some version support. --- foreging/loc_fdd.py | 5 +++++ foreging/models.py | 1 + foreging/nara.py | 1 + foreging/pronom.py | 1 + 4 files changed, 8 insertions(+) diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py index 6cb33c40..f8f0bab5 100644 --- a/foreging/loc_fdd.py +++ b/foreging/loc_fdd.py @@ -28,6 +28,10 @@ def get_formats(self): root = BeautifulSoup(xml, "xml") ffd_id = root.find('FDD').get('id') f_name = root.find('FDD').get('titleName') + # If there's a version string, grab it: + f_version = None + if ", Version " in f_name: + f_version = f_name.split(", Version ", 1)[1] # Genre: f_genres = list() for gns in root.findAll('gdfrGenreSelection'): @@ -57,6 +61,7 @@ def get_formats(self): registry_id=self.registry_id, id=ffd_id, name=f_name, + version=f_version, summary=root.find("shortDescription").text, genres=f_genres, extensions=f_extensions, diff --git a/foreging/models.py b/foreging/models.py index 7690645f..75bb1dee 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -6,6 +6,7 @@ class Format(BaseModel): registry_id: str id: str name: str + version: Optional[str] summary: str genres: List[str] = [] extensions: List[str] = [] diff --git a/foreging/nara.py b/foreging/nara.py index 6bd5a682..e057a7a3 100644 --- a/foreging/nara.py +++ b/foreging/nara.py @@ -60,6 +60,7 @@ def get_formats(self): registry_id=self.registry_id, id=ff_id, name=g.value(s, NARA.formatName), + version=None, summary=g.value(s, DCTERMS.description), genres=[o for s, p, o in g.triples((s, NARA.category, None))], extensions=[o for s, p, o in g.triples((s, WDT.p1195, None))], diff --git a/foreging/pronom.py b/foreging/pronom.py index 9e01f8ef..ae604371 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -67,6 +67,7 @@ def get_formats(self): registry_id=self.registry_id, id=ffd_id, name=f_name, + version=root.find("FormatVersion").text, summary=root.find("FormatDescription").text, genres=f_types, extensions=f_extensions, From 78a31a0692cb789e81d84a3e77ddcbc35b704d70 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Tue, 10 Sep 2024 11:37:47 +0100 Subject: [PATCH 10/53] Updated data hashes. --- digipres.github.io | 2 +- dvc.lock | 38 +++++++++++++++++++------------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/digipres.github.io b/digipres.github.io index 35b8cd80..1e43c1d9 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit 35b8cd809ed4dd4819cca09fb15356c20a55521f +Subproject commit 1e43c1d9bcc5c1e5920005f469ea3ccde385510f diff --git a/dvc.lock b/dvc.lock index d604b008..bc1cc721 100644 --- a/dvc.lock +++ b/dvc.lock @@ -8,27 +8,27 @@ stages: deps: - path: digipres.github.io/_sources/registries hash: md5 - md5: 19b255a97dce0ce49a396d4c7ddcdaa8.dir - size: 216499227 - nfiles: 27072 + md5: 2fbddb654293f1999c201be2db60b393.dir + size: 219597556 + nfiles: 27236 - path: foreging hash: md5 - md5: 85ed7928fc1ba6db8855fa23adbe8e4a.dir - size: 32957 + md5: 9f5a3d0919d80673cab7b5352513b3bb.dir + size: 33535 nfiles: 12 outs: - path: loc.jsonl hash: md5 - md5: 380914567a5637ad1a6c075403b1db89 - size: 503541 + md5: 40f36b114a5d92d33b2766acddf2d11f + size: 513352 - path: nara.jsonl hash: md5 - md5: 9e00363e4dfdd006d597aeb9bf7cdeab - size: 1257013 + md5: b7e77b685e0514e37bc16cefdc187db1 + size: 1267948 - path: pronom.jsonl hash: md5 - md5: a1e49e58d2be69a27ad2e5f75203e37a - size: 2376855 + md5: 780f7bf1add431a306d7e8874e97be33 + size: 2416106 generate-sqlite: cmd: - sqlite-utils insert registries.db formats --nl loc.jsonl @@ -39,18 +39,18 @@ stages: deps: - path: loc.jsonl hash: md5 - md5: 380914567a5637ad1a6c075403b1db89 - size: 503541 + md5: 40f36b114a5d92d33b2766acddf2d11f + size: 513352 - path: nara.jsonl hash: md5 - md5: 9e00363e4dfdd006d597aeb9bf7cdeab - size: 1257013 + md5: b7e77b685e0514e37bc16cefdc187db1 + size: 1267948 - path: pronom.jsonl hash: md5 - md5: a1e49e58d2be69a27ad2e5f75203e37a - size: 2376855 + md5: 780f7bf1add431a306d7e8874e97be33 + size: 2416106 outs: - path: registries.db hash: md5 - md5: 81d65d6f0e722015f8cbf8c8828b069f - size: 4759552 + md5: b07a80e8ad84f0f007d08e1ae7cb3ac4 + size: 4767744 From 03a20632bc70f27d7e477a058361cc60c5cc376b Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 13 Feb 2025 13:10:19 +0000 Subject: [PATCH 11/53] Switched to Makefile, extended TCDB support. --- .dvc/.gitignore | 3 --- .dvc/config | 0 .dvcignore | 3 --- .gitignore | 6 ++--- Makefile | 24 +++++++++++++++++++ digipres.github.io | 2 +- dvc.lock | 56 --------------------------------------------- dvc.yaml | 25 -------------------- foreging/loc_fdd.py | 2 +- foreging/models.py | 29 +++++++++++++++++------ foreging/nara.py | 2 +- foreging/tcdb.py | 24 +++++++++++++------ pyproject.toml | 15 ++++++++++++ 13 files changed, 84 insertions(+), 107 deletions(-) delete mode 100644 .dvc/.gitignore delete mode 100644 .dvc/config delete mode 100644 .dvcignore create mode 100644 Makefile delete mode 100644 dvc.lock delete mode 100644 dvc.yaml create mode 100644 pyproject.toml diff --git a/.dvc/.gitignore b/.dvc/.gitignore deleted file mode 100644 index 528f30c7..00000000 --- a/.dvc/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/config.local -/tmp -/cache diff --git a/.dvc/config b/.dvc/config deleted file mode 100644 index e69de29b..00000000 diff --git a/.dvcignore b/.dvcignore deleted file mode 100644 index 51973055..00000000 --- a/.dvcignore +++ /dev/null @@ -1,3 +0,0 @@ -# Add patterns of files dvc should ignore, which could improve -# the performance. Learn more at -# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitignore b/.gitignore index bae1a9a0..5b183b5f 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,6 @@ /passwordfile *.pyc /registries.db -/loc.jsonl -/pronom.jsonl -/nara.jsonl +/data +/.venv +/build diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..88cffd55 --- /dev/null +++ b/Makefile @@ -0,0 +1,24 @@ + +DATAFILES := data/pronom.jsonl data/loc.jsonl data/nara.jsonl data/tcdb.jsonl + +all: data/registries.db + +data/pronom.jsonl: foreging digipres.github.io/_sources/registries + python -m foreging.pronom > $@ + +data/loc.jsonl: foreging digipres.github.io/_sources/registries + python -m foreging.loc_fdd > $@ + +data/nara.jsonl: foreging digipres.github.io/_sources/registries + python -m foreging.nara > $@ + +data/tcdb.jsonl: foreging digipres.github.io/_sources/registries + python -m foreging.tcdb > $@ + +data/registries.db: $(DATAFILES) + rm -f $@ + sqlite-utils insert $@ formats --nl data/pronom.jsonl + sqlite-utils insert $@ formats --nl data/loc.jsonl + sqlite-utils insert $@ formats --nl data/nara.jsonl + sqlite-utils insert $@ formats --nl data/tcdb.jsonl + sqlite-utils enable-fts $@ formats name summary extensions iana_media_types genres \ No newline at end of file diff --git a/digipres.github.io b/digipres.github.io index 1e43c1d9..c568e23d 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit 1e43c1d9bcc5c1e5920005f469ea3ccde385510f +Subproject commit c568e23d7f9f2f7a20115ba90f255f14494c98ec diff --git a/dvc.lock b/dvc.lock deleted file mode 100644 index bc1cc721..00000000 --- a/dvc.lock +++ /dev/null @@ -1,56 +0,0 @@ -schema: '2.0' -stages: - transform-format-data: - cmd: - - python -m foreging.pronom > pronom.jsonl - - python -m foreging.loc_fdd > loc.jsonl - - python -m foreging.nara > nara.jsonl - deps: - - path: digipres.github.io/_sources/registries - hash: md5 - md5: 2fbddb654293f1999c201be2db60b393.dir - size: 219597556 - nfiles: 27236 - - path: foreging - hash: md5 - md5: 9f5a3d0919d80673cab7b5352513b3bb.dir - size: 33535 - nfiles: 12 - outs: - - path: loc.jsonl - hash: md5 - md5: 40f36b114a5d92d33b2766acddf2d11f - size: 513352 - - path: nara.jsonl - hash: md5 - md5: b7e77b685e0514e37bc16cefdc187db1 - size: 1267948 - - path: pronom.jsonl - hash: md5 - md5: 780f7bf1add431a306d7e8874e97be33 - size: 2416106 - generate-sqlite: - cmd: - - sqlite-utils insert registries.db formats --nl loc.jsonl - - sqlite-utils insert registries.db formats --nl pronom.jsonl - - sqlite-utils insert registries.db formats --nl nara.jsonl - - sqlite-utils enable-fts registries.db formats name summary extensions iana_media_types - genres - deps: - - path: loc.jsonl - hash: md5 - md5: 40f36b114a5d92d33b2766acddf2d11f - size: 513352 - - path: nara.jsonl - hash: md5 - md5: b7e77b685e0514e37bc16cefdc187db1 - size: 1267948 - - path: pronom.jsonl - hash: md5 - md5: 780f7bf1add431a306d7e8874e97be33 - size: 2416106 - outs: - - path: registries.db - hash: md5 - md5: b07a80e8ad84f0f007d08e1ae7cb3ac4 - size: 4767744 diff --git a/dvc.yaml b/dvc.yaml deleted file mode 100644 index 269bf1b3..00000000 --- a/dvc.yaml +++ /dev/null @@ -1,25 +0,0 @@ -stages: - transform-format-data: - cmd: - - python -m foreging.pronom > pronom.jsonl - - python -m foreging.loc_fdd > loc.jsonl - - python -m foreging.nara > nara.jsonl - deps: - - digipres.github.io/_sources/registries # source data - - foreging # source code - outs: # Note that 'outs' means DVC will wipe them before running: - - loc.jsonl - - pronom.jsonl - - nara.jsonl - generate-sqlite: - cmd: - - sqlite-utils insert registries.db formats --nl loc.jsonl - - sqlite-utils insert registries.db formats --nl pronom.jsonl - - sqlite-utils insert registries.db formats --nl nara.jsonl - - sqlite-utils enable-fts registries.db formats name summary extensions iana_media_types genres - deps: - - loc.jsonl - - nara.jsonl - - pronom.jsonl - outs: - - registries.db \ No newline at end of file diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py index f8f0bab5..4015ed5f 100644 --- a/foreging/loc_fdd.py +++ b/foreging/loc_fdd.py @@ -7,7 +7,7 @@ logger = logging.getLogger(__name__) class LocFDD(): - registry_id = "loc_fdd" + registry_id = "loc-fdd" source_folder = 'digipres.github.io/_sources/registries/fdd/fddXML' warnings = [] show_parsed_xml_on_errors = False diff --git a/foreging/models.py b/foreging/models.py index 75bb1dee..f866cc8c 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -7,17 +7,32 @@ class Format(BaseModel): id: str name: str version: Optional[str] - summary: str + summary: Optional[str] = None + registry_url: AnyHttpUrl + registry_source_data_url: AnyHttpUrl + registry_index_data_url: Optional[AnyHttpUrl] + created: Optional[PastDate] = None + last_modified: Optional[PastDate] = None + # A spot of any additional fields: + additional_fields: Optional[Dict[str,List[str]]] + # Fields relating to format: genres: List[str] = [] extensions: List[str] = [] iana_media_types: List[str] = [] has_magic: bool primary_media_type: Optional[str] parent_media_type: Optional[str] + +# Pydantic data model for information about software and the formats the software may read or write: +class Software(BaseModel): + registry_id: str + id: str + name: str + version: Optional[str] + summary: str registry_url: AnyHttpUrl - registry_source_data_url: AnyHttpUrl - registry_index_data_url: Optional[AnyHttpUrl] - created: Optional[PastDate] = None - last_modified: Optional[PastDate] = None - # A spot of any additional fields: - additional_fields: Optional[Dict[str,List[str]]] \ No newline at end of file + # + license + # Relationships to Formats: + can_read: List[Format] = [] + can_write: List[Format] = [] diff --git a/foreging/nara.py b/foreging/nara.py index e057a7a3..424298b4 100644 --- a/foreging/nara.py +++ b/foreging/nara.py @@ -37,7 +37,7 @@ class WDT(DefinedNamespace): # NARA File Format Preservation Plan parser # class NARA_FFPP(): - registry_id = "nara_ffpp" + registry_id = "nara-ffpp" source_file = 'digipres.github.io/_sources/registries/nara/fileformats.ttl' warnings = [] diff --git a/foreging/tcdb.py b/foreging/tcdb.py index b4315b90..f206cc41 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -22,19 +22,28 @@ def get_formats(self): with open(self.source_file, "r", encoding='utf-8-sig') as csv_file: reader = csv.DictReader(csv_file) for row in reader: + logger.debug(f"Processing row: {row}") + type_code = row['Type'].strip() + creator_code = row['Creator'].strip() + extension = row['Extension'].strip() + category = row['Category'].strip() + name = row['File Name'].strip() + comments = row['Comments'].strip() + # Store additional fields: additionals = { - 'mac-type': [row['Type']], - 'mac-creator': [row['Creator']] + 'mac-type-code': [type_code], + 'mac-creator-code': [creator_code], + 'comments': [comments] } # Set up as a format entity: f = Format( registry_id=self.registry_id, - id=f"tcdb:{row['Type']}:{row['Creator']}", - name=row['File Name'], + id=f"tcdb:{type_code}:{creator_code}", + name=name, version=None, - summary=f"Type: {row['Type']}, Creator: {row['Creator']}, Comments: {row['Comments']}", - genres=[row['Category']], - extensions=[row['Extension']], + summary=None, + genres=[category] if category else [], + extensions=[extension] if extension else [], iana_media_types=[], has_magic=False, primary_media_type=None, @@ -46,6 +55,7 @@ def get_formats(self): created=None, last_modified=None, ) + logger.debug(f"Generated format: {f}") yield f diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..ef7e6115 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,15 @@ +[project] +name = "sentinel" +version = "2.0.0" +dependencies = [ + "requests", + "pyyaml", + "beautifulsoup4", + "lxml", + "rdflib", + "pydantic", + "sqlite-utils", +] + +[tool.setuptools.packages.find] +include = ["foreging"] \ No newline at end of file From d4d32a05de4ffe4cd603ae1938865e6cc57de233 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 13 Feb 2025 13:13:16 +0000 Subject: [PATCH 12/53] Updated pywikibot submodule. --- pywikibot | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywikibot b/pywikibot index 2d6a3f78..c143841e 160000 --- a/pywikibot +++ b/pywikibot @@ -1 +1 @@ -Subproject commit 2d6a3f78b281fafe8113223f064384a4577d2ae4 +Subproject commit c143841e21d2cc753036ea19e30b2eb38ccbda48 From 4612a825b10d64dbf9650c54d1cc91a3474fa010 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 13 Feb 2025 13:19:43 +0000 Subject: [PATCH 13/53] Notes to update. --- Makefile | 2 +- README.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 88cffd55..6c483509 100644 --- a/Makefile +++ b/Makefile @@ -21,4 +21,4 @@ data/registries.db: $(DATAFILES) sqlite-utils insert $@ formats --nl data/loc.jsonl sqlite-utils insert $@ formats --nl data/nara.jsonl sqlite-utils insert $@ formats --nl data/tcdb.jsonl - sqlite-utils enable-fts $@ formats name summary extensions iana_media_types genres \ No newline at end of file + sqlite-utils enable-fts $@ formats name summary extensions iana_media_types genres additional_fields \ No newline at end of file diff --git a/README.md b/README.md index dce7756f..1e07311c 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,8 @@ To Do * http://en.wikipedia.org/wiki/Alphabetical_list_of_filename_extensions_%28M%E2%80%93R%29 * http://www.webarchive.org.uk/aadda-discovery/formats?f[0]=content_type_ext:%22.bmp%22 * https://twitter.com/benfinoradin/status/532212803630039041 +* Talk about how to use `git submodule update --recursive --remote` to make sure `pywikibot` and `digipres.github.io` are up to date. +* Using `uvx datasette serve data/registries.db` to quickly poke around in the database. COPTR Bot --------- From 5e687337c65e7c79dd1bb369110f48a721a41609 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 13 Feb 2025 14:47:21 +0000 Subject: [PATCH 14/53] Update submodules. --- digipres.github.io | 2 +- pywikibot | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/digipres.github.io b/digipres.github.io index c568e23d..7d0a158a 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit c568e23d7f9f2f7a20115ba90f255f14494c98ec +Subproject commit 7d0a158a697769afa5482e3c929fc968e070ec3d diff --git a/pywikibot b/pywikibot index c143841e..042c4329 160000 --- a/pywikibot +++ b/pywikibot @@ -1 +1 @@ -Subproject commit c143841e21d2cc753036ea19e30b2eb38ccbda48 +Subproject commit 042c4329d0cd6b963848ca2c09b4c59e329b10d5 From 618d577abeba4f0837f375347eb1eee2a7b263bf Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 13 Feb 2025 22:14:17 +0000 Subject: [PATCH 15/53] Add initial WikiData processor and try workflowing it. --- .github/workflows/data-update.yml | 3 + Makefile | 16 ++-- foreging/models.py | 32 ++++--- foreging/tcdb.py | 76 +++++++++------ foreging/wikidata.py | 149 ++++++++++++++++++++++++++++++ setup.sh | 6 +- 6 files changed, 228 insertions(+), 54 deletions(-) create mode 100644 foreging/wikidata.py diff --git a/.github/workflows/data-update.yml b/.github/workflows/data-update.yml index 8a61b9ad..99106cb9 100644 --- a/.github/workflows/data-update.yml +++ b/.github/workflows/data-update.yml @@ -34,6 +34,9 @@ jobs: - name: Update from various data sources... run: ./update.sh + - name: Generate derivatives... + run: make + - name: Deploy updated site... run: ./deploy.sh env: diff --git a/Makefile b/Makefile index 6c483509..7369606a 100644 --- a/Makefile +++ b/Makefile @@ -1,24 +1,28 @@ -DATAFILES := data/pronom.jsonl data/loc.jsonl data/nara.jsonl data/tcdb.jsonl +DATAFILES := data/pronom.jsonl data/loc.jsonl data/nara.jsonl data/tcdb.jsonl data/wikidata.jsonl all: data/registries.db -data/pronom.jsonl: foreging digipres.github.io/_sources/registries +data/pronom.jsonl: foreging/pronom.py digipres.github.io/_sources/registries python -m foreging.pronom > $@ -data/loc.jsonl: foreging digipres.github.io/_sources/registries +data/loc.jsonl: foreging/loc_fdd.py digipres.github.io/_sources/registries python -m foreging.loc_fdd > $@ -data/nara.jsonl: foreging digipres.github.io/_sources/registries +data/nara.jsonl: foreging/nara.py digipres.github.io/_sources/registries python -m foreging.nara > $@ -data/tcdb.jsonl: foreging digipres.github.io/_sources/registries +data/tcdb.jsonl: foreging/tcdb.py digipres.github.io/_sources/registries python -m foreging.tcdb > $@ +data/wikidata.jsonl: foreging/wikidata.py digipres.github.io/_sources/registries + python -m foreging.wikidata > $@ + data/registries.db: $(DATAFILES) rm -f $@ sqlite-utils insert $@ formats --nl data/pronom.jsonl sqlite-utils insert $@ formats --nl data/loc.jsonl sqlite-utils insert $@ formats --nl data/nara.jsonl sqlite-utils insert $@ formats --nl data/tcdb.jsonl - sqlite-utils enable-fts $@ formats name summary extensions iana_media_types genres additional_fields \ No newline at end of file + sqlite-utils insert $@ formats --nl data/wikidata.jsonl + sqlite-utils enable-fts $@ formats name summary extensions iana_media_types genres readers writers additional_fields \ No newline at end of file diff --git a/foreging/models.py b/foreging/models.py index f866cc8c..46b84630 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -1,6 +1,20 @@ from typing import List, Optional, Set, Dict from pydantic import BaseModel, AnyHttpUrl, PastDate +# Pydantic data model for information about software and the formats the software may read or write: +class Software(BaseModel): + registry_id: str + id: str + name: str + version: Optional[str] = None + summary: Optional[str] = None + registry_url: AnyHttpUrl + # + license: Optional[str] = None + # + writes: List[str] = [] + reads: List[str] = [] + # Pydantic data model for partially normalised/star-schema format registry entries: class Format(BaseModel): registry_id: str @@ -14,7 +28,7 @@ class Format(BaseModel): created: Optional[PastDate] = None last_modified: Optional[PastDate] = None # A spot of any additional fields: - additional_fields: Optional[Dict[str,List[str]]] + additional_fields: Optional[Dict[str,List[str]]] = None # Fields relating to format: genres: List[str] = [] extensions: List[str] = [] @@ -22,17 +36,7 @@ class Format(BaseModel): has_magic: bool primary_media_type: Optional[str] parent_media_type: Optional[str] + # Nested fields relating to software: + readers: List[Software] = [] + writers: List[Software] = [] -# Pydantic data model for information about software and the formats the software may read or write: -class Software(BaseModel): - registry_id: str - id: str - name: str - version: Optional[str] - summary: str - registry_url: AnyHttpUrl - # - license - # Relationships to Formats: - can_read: List[Format] = [] - can_write: List[Format] = [] diff --git a/foreging/tcdb.py b/foreging/tcdb.py index f206cc41..1760016e 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -1,7 +1,7 @@ import os import csv import logging -from .models import Format +from .models import Format, Software logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -12,51 +12,69 @@ # class TCDB(): registry_id = "tcdb" + registry_url = f"https://github.com/thorsted/Born-Digital-Scripts/tree/main/TC%20Identification" source_file = 'digipres.github.io/_sources/registries/tcdb/TCDB_2003.8_data-cleaned.csv' warnings = [] def get_formats(self): logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) + # First, gather rows by type_code... + rows_by_type_code = {} # Open, coping with Unicode BOM with open(self.source_file, "r", encoding='utf-8-sig') as csv_file: reader = csv.DictReader(csv_file) for row in reader: logger.debug(f"Processing row: {row}") type_code = row['Type'].strip() + if type_code not in rows_by_type_code: + rows_by_type_code[type_code] = [] + rows_by_type_code[type_code].append(row) + + # Now, process each type_code: + for type_code, rows in rows_by_type_code.items(): + readers = [] + extensions = [] + categories = [] + names = [] + for row in rows: + logger.debug(f"Processing row: {row}") creator_code = row['Creator'].strip() - extension = row['Extension'].strip() - category = row['Category'].strip() - name = row['File Name'].strip() - comments = row['Comments'].strip() - # Store additional fields: - additionals = { - 'mac-type-code': [type_code], - 'mac-creator-code': [creator_code], - 'comments': [comments] - } - # Set up as a format entity: - f = Format( + extensions.append(row['Extension'].strip()) + categories.append(row['Category'].strip()) + names.append(row['File Name'].strip()) + s = Software( registry_id=self.registry_id, id=f"tcdb:{type_code}:{creator_code}", - name=name, + name=row['File Name'].strip(), version=None, - summary=None, - genres=[category] if category else [], - extensions=[extension] if extension else [], - iana_media_types=[], - has_magic=False, - primary_media_type=None, - parent_media_type=None, - registry_url=f"https://github.com/thorsted/Born-Digital-Scripts/tree/main/TC%20Identification", - registry_source_data_url=f"https://github.com/thorsted/Born-Digital-Scripts/blob/main/TC%20Identification/TCDB_2003.8_data-cleaned.csv", - registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/tcdb/TCDB_2003.8_data-cleaned.csv", - additional_fields=additionals, - created=None, - last_modified=None, + summary=row['Comments'].strip(), + registry_url=self.registry_url, + reads=[f"tcdb:{type_code}"] ) - logger.debug(f"Generated format: {f}") - yield f + readers.append(s) + # Set up as a format entity: + f = Format( + registry_id=self.registry_id, + id=f"tcdb:{type_code}", + name= ", ".join(names), + version=None, + summary=None, + genres=[x for x in categories if x], + extensions=[x.lower() for x in extensions if x], + iana_media_types=[], + has_magic=False, + primary_media_type=None, + parent_media_type=None, + registry_url=self.registry_url, + registry_source_data_url=f"https://github.com/thorsted/Born-Digital-Scripts/blob/main/TC%20Identification/TCDB_2003.8_data-cleaned.csv", + registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/tcdb/TCDB_2003.8_data-cleaned.csv", + created=None, + last_modified=None, + readers=readers, + ) + logger.debug(f"Generated format: {f}") + yield f if __name__ == "__main__": diff --git a/foreging/wikidata.py b/foreging/wikidata.py new file mode 100644 index 00000000..0f1e6b81 --- /dev/null +++ b/foreging/wikidata.py @@ -0,0 +1,149 @@ +from collections import defaultdict +import os +import json +import logging +from .models import Format, Software +import argparse + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# +# +# WikiData dumps parser +# +class WikiData(): + registry_id = "wikidata" + source_file_dir = "digipres.github.io/_sources/registries/wikidata" + fmt_source_file = f"{source_file_dir}/wikidata.json" + sw_r_source_file = f"{source_file_dir}/wikidata-reads.json" + sw_w_source_file = f"{source_file_dir}/wikidata-writes.json" + warnings = [] + + def get_formats(self): + logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) + + with open (self.fmt_source_file, 'r') as f: + wd = json.load(f) + + fmts = {} + + current_qid = None + + for fmt in wd: + qid = f"wikidata:{fmt['id']}" + # items are ordered by ID, so we can aggregate as we go + if qid != current_qid: + # Store the previous record: + if current_qid: + fmts[current_qid] = finfo + current_qid = qid + # Start a new record: + finfo = {} + finfo['name'] = fmt['name'] + finfo['source'] = fmt['source'] + finfo['extensions'] = set() + finfo['mimetypes'] = set() + finfo['hasMagic'] = False + finfo['readers'] = [] + finfo['writers'] = [] + # Aggregate value for each ID + for key in fmt: + if key == 'extension' and fmt[key]: + finfo['extensions'].add(fmt[key]) + if key == 'mimetype' and fmt[key]: + finfo['mimetypes'].add(fmt[key]) + if key == 'sig' and fmt[key]: + finfo['hasMagic'] = True + + # Add the final one: + if current_qid: + fmts[current_qid] = finfo + + # Now get the software: + + # Load the 'what reads this' and 'what writes this' data: + with open (self.sw_r_source_file, 'r') as f: + sw_r = json.load(f) + with open (self.sw_w_source_file, 'r') as f: + sw_w = json.load(f) + + # Process the software data: + sws = {} + for mode, sw_i in [('reads', sw_r), ('writes', sw_w)]: + for sw in sw_i: + qid = sw['format'].replace("http://www.wikidata.org/entity/","wikidata:") + if qid not in fmts: + logger.warning(f"Software entry for unknown format {qid}: {sw['formatLabel']}") + self.warnings.append(f"Software entry for unknown format {qid}: {sw['formatLabel']}") + continue + sw_qid = sw['id'] + if sw_qid not in sws: + sws[sw_qid] = sw + sws[sw_qid]['reads'] = [] + sws[sw_qid]['writes'] = [] + sws[sw_qid][mode].append(qid) + + # Now add the software to the formats: + for sw in sws.values(): + s = self.make_software(sw) + for qid in sw['reads']: + fmts[qid]['readers'].append(s) + for qid in sw['writes']: + fmts[qid]['writers'].append(s) + + + # And return the format: + for qid in fmts: + info = fmts[qid] + yield self.make_format(qid,info) + + + def make_format(self, current_qid, finfo): + + # Set up as a format entity: + f = Format( + registry_id=self.registry_id, + id=f"wikidata:{current_qid}", + name=finfo['name'], + version=None, + summary=None, + genres= [], + extensions=list(finfo['extensions']), + iana_media_types=list(finfo['mimetypes']), + has_magic=finfo['hasMagic'], + primary_media_type=None, + parent_media_type=None, + registry_url=finfo['source'], + registry_source_data_url=f"{finfo['source']}.jsonld", + registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{self.fmt_source_file}", + additional_fields={}, + created=None, + last_modified=None, + readers=finfo['readers'], + writers=finfo['writers'] + ) + logger.debug(f"Generated format: {f}") + return f + + + def make_software(self, info): + s = Software( + registry_id=self.registry_id, + id=f"wikidata:{info['id']}", + name=info['name'], + version=None, + summary=None, + registry_url=info['source'], + license=info['licenseLabel'], + reads=info['reads'], + writes=info['writes'] + ) + logger.debug(f"Generated software: {s}") + return s + + +if __name__ == "__main__": + gen = WikiData() + for f in gen.get_formats(): + print(f.model_dump_json()) diff --git a/setup.sh b/setup.sh index e4efb73f..131a3632 100755 --- a/setup.sh +++ b/setup.sh @@ -15,11 +15,7 @@ cd pywikibot pip install . cd .. -pip install requests -pip install pyyaml -pip install beautifulsoup4 -pip install lxml -pip install rdflib +pip install . # Running... echo "And login..." From 8b2fdbbd1eea250a2dab3950366ff341bae14271 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 13 Feb 2025 22:23:51 +0000 Subject: [PATCH 16/53] Use a script instead. --- .github/workflows/data-update.yml | 2 +- derive.sh | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100755 derive.sh diff --git a/.github/workflows/data-update.yml b/.github/workflows/data-update.yml index 99106cb9..e37a126e 100644 --- a/.github/workflows/data-update.yml +++ b/.github/workflows/data-update.yml @@ -35,7 +35,7 @@ jobs: run: ./update.sh - name: Generate derivatives... - run: make + run: ./derive.sh - name: Deploy updated site... run: ./deploy.sh diff --git a/derive.sh b/derive.sh new file mode 100755 index 00000000..c2eb3d93 --- /dev/null +++ b/derive.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +source venv/bin/activate + +make + +cp data/registries.db digipres.github.io/_data/formats/ + From 0ab49f00b467ba42b2a6a35da6ef8735dbda29e5 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 13 Feb 2025 23:29:05 +0000 Subject: [PATCH 17/53] Repair aggregator script. --- aggregates.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/aggregates.py b/aggregates.py index ee28feb8..6c4adefc 100644 --- a/aggregates.py +++ b/aggregates.py @@ -86,7 +86,50 @@ def addFormat(rid,fid,finfo): # And add: fmts[rid]['formats'][fid] = finfo +def aggregateFDD(): + rid = "fdd" + print("Parsing %s..." % rid) + for filename in os.listdir('digipres.github.io/_sources/registries/fdd/fddXML'): + if filename.endswith(".xml"): + print(f"Parsing {filename}...") + # Get Identifier? + with open('digipres.github.io/_sources/registries/fdd/fddXML/'+filename, "rb") as f: + finfo = {} + finfo['source'] = filename + xml = f.read() + root = None + try: + #parser = etree.XMLParser() + #root = etree.parse(BytesIO(xml), parser) + root = BeautifulSoup(xml, "xml") + ffd_id = root.find('FDD').get('id') + finfo['name'] = root.find('FDD').get('titleName') + if root.find('magicNumbers'): + finfo['hasMagic'] = True + else: + finfo['hasMagic'] = False + # Get extensions: + extensions = list() + for fe in root.findAll('filenameExtension'): + for fev in fe.findAll('sigValue'): + extensions.append("*.%s" % fev.text) + finfo['extensions'] = extensions + # Get MIME types: + mimetypes = list() + for imts in root.findAll('internetMediaType'): + for mt in imts.findAll('sigValue'): + mimetypes.append(mt.text) + finfo['mimetypes'] = mimetypes + addFormat(rid,ffd_id,finfo) + except Exception as e: + print(f"Parsing {filename} failed: {e}") + if root: + print("XML parsed as:") + print(root.prettify()) + #print(etree.tostring(root, pretty_print=True).decode('utf-8')) + if rid in fmts: # FIXME this needs to be more robust, rather than relying on happening after 'addFormat' is called for the first time. + fmts[rid]['warnings'].append(f"Error when parsing XML from '{filename}': {e}") def aggregateTRiD(): From 3bfbd7aac378c2880f9565b763340db54910fe82 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 14 Feb 2025 07:25:47 +0000 Subject: [PATCH 18/53] Make the dir first. --- .gitignore | 1 - Makefile | 15 +++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 5b183b5f..4292fdd5 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,5 @@ /passwordfile *.pyc /registries.db -/data /.venv /build diff --git a/Makefile b/Makefile index 7369606a..b39107fe 100644 --- a/Makefile +++ b/Makefile @@ -3,22 +3,25 @@ DATAFILES := data/pronom.jsonl data/loc.jsonl data/nara.jsonl data/tcdb.jsonl da all: data/registries.db -data/pronom.jsonl: foreging/pronom.py digipres.github.io/_sources/registries +data: + mkdir -p data + +data/pronom.jsonl: data foreging/pronom.py digipres.github.io/_sources/registries python -m foreging.pronom > $@ -data/loc.jsonl: foreging/loc_fdd.py digipres.github.io/_sources/registries +data/loc.jsonl: data foreging/loc_fdd.py digipres.github.io/_sources/registries python -m foreging.loc_fdd > $@ -data/nara.jsonl: foreging/nara.py digipres.github.io/_sources/registries +data/nara.jsonl: data foreging/nara.py digipres.github.io/_sources/registries python -m foreging.nara > $@ -data/tcdb.jsonl: foreging/tcdb.py digipres.github.io/_sources/registries +data/tcdb.jsonl: data foreging/tcdb.py digipres.github.io/_sources/registries python -m foreging.tcdb > $@ -data/wikidata.jsonl: foreging/wikidata.py digipres.github.io/_sources/registries +data/wikidata.jsonl: data foreging/wikidata.py digipres.github.io/_sources/registries python -m foreging.wikidata > $@ -data/registries.db: $(DATAFILES) +data/registries.db: data $(DATAFILES) rm -f $@ sqlite-utils insert $@ formats --nl data/pronom.jsonl sqlite-utils insert $@ formats --nl data/loc.jsonl From 2cb2cb0748f55aea86db88f9f27193581af1ae45 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 14 Feb 2025 13:43:03 +0000 Subject: [PATCH 19/53] Core working WikiData-to-SQLite engine. --- Makefile | 16 ++++---- digipres.github.io | 2 +- foreging/tcdb.py | 10 ++--- foreging/wikidata.py | 95 ++++++++++++++++++++++++++++---------------- 4 files changed, 74 insertions(+), 49 deletions(-) diff --git a/Makefile b/Makefile index b39107fe..2f6c0828 100644 --- a/Makefile +++ b/Makefile @@ -1,30 +1,30 @@ DATAFILES := data/pronom.jsonl data/loc.jsonl data/nara.jsonl data/tcdb.jsonl data/wikidata.jsonl -all: data/registries.db +all: data data/registries.db data: mkdir -p data -data/pronom.jsonl: data foreging/pronom.py digipres.github.io/_sources/registries +data/pronom.jsonl: foreging/pronom.py digipres.github.io/_sources/registries python -m foreging.pronom > $@ -data/loc.jsonl: data foreging/loc_fdd.py digipres.github.io/_sources/registries +data/loc_fdd.jsonl: foreging/loc_fdd.py digipres.github.io/_sources/registries python -m foreging.loc_fdd > $@ -data/nara.jsonl: data foreging/nara.py digipres.github.io/_sources/registries +data/nara.jsonl: foreging/nara.py digipres.github.io/_sources/registries python -m foreging.nara > $@ -data/tcdb.jsonl: data foreging/tcdb.py digipres.github.io/_sources/registries +data/tcdb.jsonl: foreging/tcdb.py digipres.github.io/_sources/registries python -m foreging.tcdb > $@ -data/wikidata.jsonl: data foreging/wikidata.py digipres.github.io/_sources/registries +data/wikidata.jsonl: foreging/wikidata.py digipres.github.io/_sources/registries python -m foreging.wikidata > $@ -data/registries.db: data $(DATAFILES) +data/registries.db: $(DATAFILES) rm -f $@ sqlite-utils insert $@ formats --nl data/pronom.jsonl - sqlite-utils insert $@ formats --nl data/loc.jsonl + sqlite-utils insert $@ formats --nl data/loc_fdd.jsonl sqlite-utils insert $@ formats --nl data/nara.jsonl sqlite-utils insert $@ formats --nl data/tcdb.jsonl sqlite-utils insert $@ formats --nl data/wikidata.jsonl diff --git a/digipres.github.io b/digipres.github.io index 7d0a158a..64c8237d 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit 7d0a158a697769afa5482e3c929fc968e070ec3d +Subproject commit 64c8237de0f9866fe80d20c239ba9d694e4b381f diff --git a/foreging/tcdb.py b/foreging/tcdb.py index 1760016e..f13a923d 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -34,14 +34,14 @@ def get_formats(self): # Now, process each type_code: for type_code, rows in rows_by_type_code.items(): readers = [] - extensions = [] - categories = [] + extensions = set() + categories = set() names = [] for row in rows: logger.debug(f"Processing row: {row}") creator_code = row['Creator'].strip() - extensions.append(row['Extension'].strip()) - categories.append(row['Category'].strip()) + extensions.add(row['Extension'].strip()) + categories.add(row['Category'].strip()) names.append(row['File Name'].strip()) s = Software( registry_id=self.registry_id, @@ -57,7 +57,7 @@ def get_formats(self): f = Format( registry_id=self.registry_id, id=f"tcdb:{type_code}", - name= ", ".join(names), + name= ", ".join(names)[:256], # FIXME Limit size as this includes too much software information and is very slow to work with! version=None, summary=None, genres=[x for x in categories if x], diff --git a/foreging/wikidata.py b/foreging/wikidata.py index 0f1e6b81..aa0d1d2d 100644 --- a/foreging/wikidata.py +++ b/foreging/wikidata.py @@ -2,23 +2,25 @@ import os import json import logging -from .models import Format, Software +from .models_sql import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from sqlmodel import Field, Relationship, Session, SQLModel, create_engine import argparse logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -# + # # WikiData dumps parser # class WikiData(): registry_id = "wikidata" + registry = Registry(id=registry_id, name="WikiData") + source_file_dir = "digipres.github.io/_sources/registries/wikidata" fmt_source_file = f"{source_file_dir}/wikidata.json" sw_r_source_file = f"{source_file_dir}/wikidata-reads.json" sw_w_source_file = f"{source_file_dir}/wikidata-writes.json" - warnings = [] def get_formats(self): logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) @@ -27,6 +29,9 @@ def get_formats(self): wd = json.load(f) fmts = {} + exts = {} + mts = {} + warnings = set() current_qid = None @@ -45,14 +50,19 @@ def get_formats(self): finfo['extensions'] = set() finfo['mimetypes'] = set() finfo['hasMagic'] = False - finfo['readers'] = [] - finfo['writers'] = [] + finfo['readers'] = set() + finfo['writers'] = set() # Aggregate value for each ID for key in fmt: if key == 'extension' and fmt[key]: - finfo['extensions'].add(fmt[key]) + # Making sure we reuse the same object for an extension to keep the model consistent: + ext = fmt[key] + exts[ext] = exts.get(ext, Extension(id=ext)) + finfo['extensions'].add(exts[ext]) if key == 'mimetype' and fmt[key]: - finfo['mimetypes'].add(fmt[key]) + mt = fmt[key] + mts[mt] = mts.get(mt, MediaType(id=mt)) + finfo['mimetypes'].add(mts[mt]) if key == 'sig' and fmt[key]: finfo['hasMagic'] = True @@ -73,11 +83,13 @@ def get_formats(self): for mode, sw_i in [('reads', sw_r), ('writes', sw_w)]: for sw in sw_i: qid = sw['format'].replace("http://www.wikidata.org/entity/","wikidata:") + sw_qid = sw['id'] + # Check it's in the set: if qid not in fmts: - logger.warning(f"Software entry for unknown format {qid}: {sw['formatLabel']}") - self.warnings.append(f"Software entry for unknown format {qid}: {sw['formatLabel']}") + warning = f"Software entry '{sw_qid}: {sw['formatLabel']}' references missing format '{qid}'" + logger.warning( warning ) + warnings.add( RegistryDataLogEntry(level="warning", message=warning, url=sw['source'] ) ) continue - sw_qid = sw['id'] if sw_qid not in sws: sws[sw_qid] = sw sws[sw_qid]['reads'] = [] @@ -88,10 +100,12 @@ def get_formats(self): for sw in sws.values(): s = self.make_software(sw) for qid in sw['reads']: - fmts[qid]['readers'].append(s) + fmts[qid]['readers'].add(s) for qid in sw['writes']: - fmts[qid]['writers'].append(s) + fmts[qid]['writers'].add(s) + # Store the warnings: + self.registry.data_log = list(warnings) # And return the format: for qid in fmts: @@ -103,25 +117,25 @@ def make_format(self, current_qid, finfo): # Set up as a format entity: f = Format( - registry_id=self.registry_id, - id=f"wikidata:{current_qid}", + id=f"{current_qid}", + registry=self.registry, name=finfo['name'], version=None, summary=None, genres= [], extensions=list(finfo['extensions']), - iana_media_types=list(finfo['mimetypes']), + media_types=list(finfo['mimetypes']), has_magic=finfo['hasMagic'], - primary_media_type=None, - parent_media_type=None, - registry_url=finfo['source'], - registry_source_data_url=f"{finfo['source']}.jsonld", - registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{self.fmt_source_file}", - additional_fields={}, - created=None, - last_modified=None, - readers=finfo['readers'], - writers=finfo['writers'] + #primary_media_type=None, + #parent_media_type=None, + #registry_url=finfo['source'], + #registry_source_data_url=f"{finfo['source']}.jsonld", + #registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{self.fmt_source_file}", + #additional_fields={}, + #created=None, + #last_modified=None, + readers=list(finfo['readers']), + writers=list(finfo['writers']) ) logger.debug(f"Generated format: {f}") return f @@ -129,21 +143,32 @@ def make_format(self, current_qid, finfo): def make_software(self, info): s = Software( - registry_id=self.registry_id, + #registry_id=self.registry_id, id=f"wikidata:{info['id']}", name=info['name'], - version=None, - summary=None, - registry_url=info['source'], - license=info['licenseLabel'], - reads=info['reads'], - writes=info['writes'] + #version=None, + #summary=None, + #registry_url=info['source'], + #license=info['licenseLabel'], ) logger.debug(f"Generated software: {s}") return s if __name__ == "__main__": - gen = WikiData() - for f in gen.get_formats(): - print(f.model_dump_json()) + sqlite_file_name = "database.db" + sqlite_url = f"sqlite:///{sqlite_file_name}" + + engine = create_engine(sqlite_url, echo=False) + + SQLModel.metadata.create_all(engine) + + with Session(engine) as session: + + gen = WikiData() + i = 0 + for f in gen.get_formats(): + session.add(f) + i += 1 + if i % 100 == 0: + session.commit() From 47f387aa50283eb0dda07f79b3320ab89cf4a143 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 14 Feb 2025 14:00:13 +0000 Subject: [PATCH 20/53] Added the other fields. --- foreging/wikidata.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/foreging/wikidata.py b/foreging/wikidata.py index aa0d1d2d..df2e336f 100644 --- a/foreging/wikidata.py +++ b/foreging/wikidata.py @@ -14,14 +14,21 @@ # WikiData dumps parser # class WikiData(): - registry_id = "wikidata" - registry = Registry(id=registry_id, name="WikiData") - source_file_dir = "digipres.github.io/_sources/registries/wikidata" fmt_source_file = f"{source_file_dir}/wikidata.json" sw_r_source_file = f"{source_file_dir}/wikidata-reads.json" sw_w_source_file = f"{source_file_dir}/wikidata-writes.json" + # Set up the Registry object for this class: + registry_id = "wikidata" + registry = Registry( + id=registry_id, + name="WikiData", + url="https://www.wikidata.org/wiki/Wikidata:WikiProject_Informatics/Structures/File_formats", + index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{source_file_dir}" + ) + + def get_formats(self): logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) @@ -126,14 +133,14 @@ def make_format(self, current_qid, finfo): extensions=list(finfo['extensions']), media_types=list(finfo['mimetypes']), has_magic=finfo['hasMagic'], - #primary_media_type=None, - #parent_media_type=None, - #registry_url=finfo['source'], - #registry_source_data_url=f"{finfo['source']}.jsonld", - #registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{self.fmt_source_file}", + primary_media_type=None, + parent_media_type=None, + registry_url=finfo['source'], + registry_source_data_url=f"{finfo['source']}.jsonld", + registry_index_data_url=None, #additional_fields={}, - #created=None, - #last_modified=None, + created=None, + last_modified=None, readers=list(finfo['readers']), writers=list(finfo['writers']) ) @@ -143,13 +150,13 @@ def make_format(self, current_qid, finfo): def make_software(self, info): s = Software( - #registry_id=self.registry_id, + registry_id=self.registry_id, id=f"wikidata:{info['id']}", name=info['name'], - #version=None, - #summary=None, - #registry_url=info['source'], - #license=info['licenseLabel'], + version=None, + summary=None, + registry_url=info['source'], + license=info['licenseLabel'], ) logger.debug(f"Generated software: {s}") return s From a0977cc3881c5e32b87acba2a8f0d4a1ceaed1df Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 14 Feb 2025 14:00:54 +0000 Subject: [PATCH 21/53] Added actual SQLModel implementation. --- foreging/models_sql.py | 168 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 foreging/models_sql.py diff --git a/foreging/models_sql.py b/foreging/models_sql.py new file mode 100644 index 00000000..8caf0809 --- /dev/null +++ b/foreging/models_sql.py @@ -0,0 +1,168 @@ +from typing import List, Optional, Set, Dict +from pydantic import BaseModel, AnyHttpUrl, PastDate +from datetime import datetime +from sqlmodel import Field, Relationship, Session, SQLModel, create_engine + + +class Registry(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + name: str = Field(index=True) + url: str | None = Field() + index_data_url: str | None = Field() + + data_log: list["RegistryDataLogEntry"] = Relationship() + + +class RegistryDataLogEntry(SQLModel, table=True): + __tablename__ = 'registry_data_log' + id: int | None = Field(default=None, primary_key=True) + level: str = Field(index=True) + message: str = Field() + url: str | None = Field() + + registry_id: str | None = Field(default=None, foreign_key="registry.id") + registry: Registry | None = Relationship(back_populates="data_log") + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.message) + def __eq__(self,other): + return self.message == other.message + +class SoftwareReadsFormatLink(SQLModel, table=True): + __tablename__ = "formats_read_by_software" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + software_id: str | None = Field(default=None, foreign_key="software.id", primary_key=True) + +class SoftwareWritesFormatLink(SQLModel, table=True): + __tablename__ = "formats_written_by_software" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + software_id: str | None = Field(default=None, foreign_key="software.id", primary_key=True) + +class Software(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + name: str = Field(index=True) + version: str | None = Field(index=True) + summary: str | None = Field(index=True) + licensed: str | None = Field(index=True) + registry_url: str | None = Field(index=True) + + reads: list["Format"] = Relationship(back_populates="readers", link_model=SoftwareReadsFormatLink) + writes: list["Format"] = Relationship(back_populates="writers", link_model=SoftwareWritesFormatLink) + + registry_id: str | None = Field(default=None, foreign_key="registry.id") + registry: Registry | None = Relationship() + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.id) + def __eq__(self,other): + return self.id == other.id + +class FormatGenresLink(SQLModel, table=True): + __tablename__ = "format_genres" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + genre_id: str | None = Field(default=None, foreign_key="genre.id", primary_key=True) + +class Genre(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + name: str = Field(index=True) + # + formats: list["Format"] = Relationship(back_populates="genres", link_model=FormatGenresLink) + +class ExtensionFormatsLink(SQLModel, table=True): + __tablename__ = "format_extensions" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + extension_id: str | None = Field(default=None, foreign_key="extension.id", primary_key=True) + +class Extension(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + # + formats: list["Format"] = Relationship(back_populates="extensions", link_model=ExtensionFormatsLink) + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.id) + def __eq__(self,other): + return self.id == other.id + +class MediaTypesFormatsLink(SQLModel, table=True): + __tablename__ = "format_media_types" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + media_type_id: str | None = Field(default=None, foreign_key="media_type.id", primary_key=True) + +class MediaType(SQLModel, table=True): + __tablename__ = "media_type" + id: str | None = Field(default=None, primary_key=True) + # + formats: list["Format"] = Relationship(back_populates="media_types", link_model=MediaTypesFormatsLink) + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.id) + def __eq__(self,other): + return self.id == other.id + +class Format(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + name: str = Field(index=True) + version: str | None = Field(index=True) + summary: str | None = Field(index=True) + genres: list["Genre"] = Relationship(back_populates="formats", link_model=FormatGenresLink) + extensions: list["Extension"] = Relationship(back_populates="formats", link_model=ExtensionFormatsLink) + media_types: list["MediaType"] = Relationship(back_populates="formats", link_model=MediaTypesFormatsLink) + has_magic: bool = Field(default=False) + primary_media_type: str | None = Field(index=True) + parent_media_type: str | None = Field(index=True) + registry_url: str | None = Field(index=True) + registry_source_data_url: str | None = Field(index=True) + registry_index_data_url: str | None = Field(index=True) + created: datetime | None = Field(index=True) + last_modified: datetime | None = Field(index=True) + + readers: list["Software"] = Relationship(back_populates="reads", link_model=SoftwareReadsFormatLink) + writers: list["Software"] = Relationship(back_populates="writes", link_model=SoftwareWritesFormatLink) + + registry_id: str | None = Field(default=None, foreign_key="registry.id") + registry: Registry | None = Relationship() + + +# Pydantic data model for information about software and the formats the software may read or write: +class PydanticSoftware(BaseModel): + registry_id: str + id: str + name: str + version: Optional[str] = None + summary: Optional[str] = None + registry_url: AnyHttpUrl + # + license: Optional[str] = None + # + writes: List[str] = [] + reads: List[str] = [] + +# Pydantic data model for partially normalised/star-schema format registry entries: +class PydanticFormat(BaseModel): + registry_id: str + id: str + name: str + version: Optional[str] + summary: Optional[str] = None + registry_url: AnyHttpUrl + registry_source_data_url: AnyHttpUrl + registry_index_data_url: Optional[AnyHttpUrl] + created: Optional[PastDate] = None + last_modified: Optional[PastDate] = None + # A spot of any additional fields: + additional_fields: Optional[Dict[str,List[str]]] = None + # Fields relating to format: + genres: List[str] = [] + extensions: List[str] = [] + iana_media_types: List[str] = [] + has_magic: bool + primary_media_type: Optional[str] + parent_media_type: Optional[str] + # Nested fields relating to software: + readers: List[Software] = [] + writers: List[Software] = [] + From f51d14300f62c0163b4bf7720a096397e4796bf6 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 14 Feb 2025 15:18:42 +0000 Subject: [PATCH 22/53] Three now working. --- foreging/models.py | 170 +++++++++++++++++++++++++++++++---------- foreging/models_sql.py | 168 ---------------------------------------- foreging/populate.py | 37 +++++++++ foreging/pronom.py | 28 +++++-- foreging/tcdb.py | 73 ++++++++++-------- foreging/wikidata.py | 27 +------ 6 files changed, 232 insertions(+), 271 deletions(-) delete mode 100644 foreging/models_sql.py create mode 100644 foreging/populate.py diff --git a/foreging/models.py b/foreging/models.py index 46b84630..9d7345e4 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -1,42 +1,132 @@ -from typing import List, Optional, Set, Dict -from pydantic import BaseModel, AnyHttpUrl, PastDate - -# Pydantic data model for information about software and the formats the software may read or write: -class Software(BaseModel): - registry_id: str - id: str - name: str - version: Optional[str] = None - summary: Optional[str] = None - registry_url: AnyHttpUrl - # - license: Optional[str] = None +from datetime import date +from sqlmodel import Field, Relationship, Session, SQLModel, create_engine + + +class Registry(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + name: str = Field(index=True) + url: str | None = Field() + index_data_url: str | None = Field() + + data_log: list["RegistryDataLogEntry"] = Relationship() + + +class RegistryDataLogEntry(SQLModel, table=True): + __tablename__ = 'registry_data_log' + id: int | None = Field(default=None, primary_key=True) + level: str = Field(index=True) + message: str = Field() + url: str | None = Field() + + registry_id: str | None = Field(default=None, foreign_key="registry.id") + registry: Registry | None = Relationship(back_populates="data_log") + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.message) + def __eq__(self,other): + return self.message == other.message + +class SoftwareReadsFormatLink(SQLModel, table=True): + __tablename__ = "formats_read_by_software" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + software_id: str | None = Field(default=None, foreign_key="software.id", primary_key=True) + +class SoftwareWritesFormatLink(SQLModel, table=True): + __tablename__ = "formats_written_by_software" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + software_id: str | None = Field(default=None, foreign_key="software.id", primary_key=True) + +class Software(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + name: str = Field(index=True) + version: str | None = Field(index=True) + summary: str | None = Field(index=True) + license: str | None = Field(index=True) + registry_url: str | None = Field(index=True) + + reads: list["Format"] = Relationship(back_populates="readers", link_model=SoftwareReadsFormatLink) + writes: list["Format"] = Relationship(back_populates="writers", link_model=SoftwareWritesFormatLink) + + registry_id: str | None = Field(default=None, foreign_key="registry.id") + registry: Registry | None = Relationship() + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.id) + def __eq__(self,other): + return self.id == other.id + +class FormatGenresLink(SQLModel, table=True): + __tablename__ = "format_genres" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + genre_id: str | None = Field(default=None, foreign_key="genre.id", primary_key=True) + +class Genre(SQLModel, table=True): + id: int | None = Field(default=None, primary_key=True) + name: str = Field(index=True) # - writes: List[str] = [] - reads: List[str] = [] - -# Pydantic data model for partially normalised/star-schema format registry entries: -class Format(BaseModel): - registry_id: str - id: str - name: str - version: Optional[str] - summary: Optional[str] = None - registry_url: AnyHttpUrl - registry_source_data_url: AnyHttpUrl - registry_index_data_url: Optional[AnyHttpUrl] - created: Optional[PastDate] = None - last_modified: Optional[PastDate] = None - # A spot of any additional fields: - additional_fields: Optional[Dict[str,List[str]]] = None - # Fields relating to format: - genres: List[str] = [] - extensions: List[str] = [] - iana_media_types: List[str] = [] - has_magic: bool - primary_media_type: Optional[str] - parent_media_type: Optional[str] - # Nested fields relating to software: - readers: List[Software] = [] - writers: List[Software] = [] + formats: list["Format"] = Relationship(back_populates="genres", link_model=FormatGenresLink) + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.name) + def __eq__(self,other): + return self.name == other.name + +class ExtensionFormatsLink(SQLModel, table=True): + __tablename__ = "format_extensions" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + extension_id: str | None = Field(default=None, foreign_key="extension.id", primary_key=True) + +class Extension(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + # + formats: list["Format"] = Relationship(back_populates="extensions", link_model=ExtensionFormatsLink) + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.id) + def __eq__(self,other): + return self.id == other.id + +class MediaTypesFormatsLink(SQLModel, table=True): + __tablename__ = "format_media_types" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + media_type_id: str | None = Field(default=None, foreign_key="media_type.id", primary_key=True) + +class MediaType(SQLModel, table=True): + __tablename__ = "media_type" + id: str | None = Field(default=None, primary_key=True) + # + formats: list["Format"] = Relationship(back_populates="media_types", link_model=MediaTypesFormatsLink) + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.id) + def __eq__(self,other): + return self.id == other.id + +class Format(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + name: str = Field(index=True) + version: str | None = Field(index=True) + summary: str | None = Field(index=True) + genres: list["Genre"] = Relationship(back_populates="formats", link_model=FormatGenresLink) + extensions: list["Extension"] = Relationship(back_populates="formats", link_model=ExtensionFormatsLink) + media_types: list["MediaType"] = Relationship(back_populates="formats", link_model=MediaTypesFormatsLink) + has_magic: bool = Field(default=False) + primary_media_type: str | None = Field(index=True) + parent_media_type: str | None = Field(index=True) + registry_url: str | None = Field(index=True) + registry_source_data_url: str | None = Field(index=True) + registry_index_data_url: str | None = Field(index=True) + created: date | None = Field(index=True) + last_modified: date | None = Field(index=True) + + readers: list["Software"] = Relationship(back_populates="reads", link_model=SoftwareReadsFormatLink) + writers: list["Software"] = Relationship(back_populates="writes", link_model=SoftwareWritesFormatLink) + + registry_id: str | None = Field(default=None, foreign_key="registry.id") + registry: Registry | None = Relationship() diff --git a/foreging/models_sql.py b/foreging/models_sql.py deleted file mode 100644 index 8caf0809..00000000 --- a/foreging/models_sql.py +++ /dev/null @@ -1,168 +0,0 @@ -from typing import List, Optional, Set, Dict -from pydantic import BaseModel, AnyHttpUrl, PastDate -from datetime import datetime -from sqlmodel import Field, Relationship, Session, SQLModel, create_engine - - -class Registry(SQLModel, table=True): - id: str | None = Field(default=None, primary_key=True) - name: str = Field(index=True) - url: str | None = Field() - index_data_url: str | None = Field() - - data_log: list["RegistryDataLogEntry"] = Relationship() - - -class RegistryDataLogEntry(SQLModel, table=True): - __tablename__ = 'registry_data_log' - id: int | None = Field(default=None, primary_key=True) - level: str = Field(index=True) - message: str = Field() - url: str | None = Field() - - registry_id: str | None = Field(default=None, foreign_key="registry.id") - registry: Registry | None = Relationship(back_populates="data_log") - - # Define how to spot unique entries in a set - def __hash__(self): - return hash(self.message) - def __eq__(self,other): - return self.message == other.message - -class SoftwareReadsFormatLink(SQLModel, table=True): - __tablename__ = "formats_read_by_software" - format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) - software_id: str | None = Field(default=None, foreign_key="software.id", primary_key=True) - -class SoftwareWritesFormatLink(SQLModel, table=True): - __tablename__ = "formats_written_by_software" - format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) - software_id: str | None = Field(default=None, foreign_key="software.id", primary_key=True) - -class Software(SQLModel, table=True): - id: str | None = Field(default=None, primary_key=True) - name: str = Field(index=True) - version: str | None = Field(index=True) - summary: str | None = Field(index=True) - licensed: str | None = Field(index=True) - registry_url: str | None = Field(index=True) - - reads: list["Format"] = Relationship(back_populates="readers", link_model=SoftwareReadsFormatLink) - writes: list["Format"] = Relationship(back_populates="writers", link_model=SoftwareWritesFormatLink) - - registry_id: str | None = Field(default=None, foreign_key="registry.id") - registry: Registry | None = Relationship() - - # Define how to spot unique entries in a set - def __hash__(self): - return hash(self.id) - def __eq__(self,other): - return self.id == other.id - -class FormatGenresLink(SQLModel, table=True): - __tablename__ = "format_genres" - format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) - genre_id: str | None = Field(default=None, foreign_key="genre.id", primary_key=True) - -class Genre(SQLModel, table=True): - id: str | None = Field(default=None, primary_key=True) - name: str = Field(index=True) - # - formats: list["Format"] = Relationship(back_populates="genres", link_model=FormatGenresLink) - -class ExtensionFormatsLink(SQLModel, table=True): - __tablename__ = "format_extensions" - format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) - extension_id: str | None = Field(default=None, foreign_key="extension.id", primary_key=True) - -class Extension(SQLModel, table=True): - id: str | None = Field(default=None, primary_key=True) - # - formats: list["Format"] = Relationship(back_populates="extensions", link_model=ExtensionFormatsLink) - - # Define how to spot unique entries in a set - def __hash__(self): - return hash(self.id) - def __eq__(self,other): - return self.id == other.id - -class MediaTypesFormatsLink(SQLModel, table=True): - __tablename__ = "format_media_types" - format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) - media_type_id: str | None = Field(default=None, foreign_key="media_type.id", primary_key=True) - -class MediaType(SQLModel, table=True): - __tablename__ = "media_type" - id: str | None = Field(default=None, primary_key=True) - # - formats: list["Format"] = Relationship(back_populates="media_types", link_model=MediaTypesFormatsLink) - - # Define how to spot unique entries in a set - def __hash__(self): - return hash(self.id) - def __eq__(self,other): - return self.id == other.id - -class Format(SQLModel, table=True): - id: str | None = Field(default=None, primary_key=True) - name: str = Field(index=True) - version: str | None = Field(index=True) - summary: str | None = Field(index=True) - genres: list["Genre"] = Relationship(back_populates="formats", link_model=FormatGenresLink) - extensions: list["Extension"] = Relationship(back_populates="formats", link_model=ExtensionFormatsLink) - media_types: list["MediaType"] = Relationship(back_populates="formats", link_model=MediaTypesFormatsLink) - has_magic: bool = Field(default=False) - primary_media_type: str | None = Field(index=True) - parent_media_type: str | None = Field(index=True) - registry_url: str | None = Field(index=True) - registry_source_data_url: str | None = Field(index=True) - registry_index_data_url: str | None = Field(index=True) - created: datetime | None = Field(index=True) - last_modified: datetime | None = Field(index=True) - - readers: list["Software"] = Relationship(back_populates="reads", link_model=SoftwareReadsFormatLink) - writers: list["Software"] = Relationship(back_populates="writes", link_model=SoftwareWritesFormatLink) - - registry_id: str | None = Field(default=None, foreign_key="registry.id") - registry: Registry | None = Relationship() - - -# Pydantic data model for information about software and the formats the software may read or write: -class PydanticSoftware(BaseModel): - registry_id: str - id: str - name: str - version: Optional[str] = None - summary: Optional[str] = None - registry_url: AnyHttpUrl - # - license: Optional[str] = None - # - writes: List[str] = [] - reads: List[str] = [] - -# Pydantic data model for partially normalised/star-schema format registry entries: -class PydanticFormat(BaseModel): - registry_id: str - id: str - name: str - version: Optional[str] - summary: Optional[str] = None - registry_url: AnyHttpUrl - registry_source_data_url: AnyHttpUrl - registry_index_data_url: Optional[AnyHttpUrl] - created: Optional[PastDate] = None - last_modified: Optional[PastDate] = None - # A spot of any additional fields: - additional_fields: Optional[Dict[str,List[str]]] = None - # Fields relating to format: - genres: List[str] = [] - extensions: List[str] = [] - iana_media_types: List[str] = [] - has_magic: bool - primary_media_type: Optional[str] - parent_media_type: Optional[str] - # Nested fields relating to software: - readers: List[Software] = [] - writers: List[Software] = [] - diff --git a/foreging/populate.py b/foreging/populate.py new file mode 100644 index 00000000..d83a0862 --- /dev/null +++ b/foreging/populate.py @@ -0,0 +1,37 @@ +from .pronom import PRONOM +from .tcdb import TCDB +from .wikidata import WikiData +from sqlmodel import Session, SQLModel, create_engine + + +def populate_database(session, gen, exts, mts, genres): + i = 0 + for f in gen.get_formats(exts, mts, genres): + session.add(f) + i += 1 + if i % 200 == 0: + session.commit() + # And get the last few in: + session.commit() + +if __name__ == "__main__": + + # Cache the cross-referenced entities: + exts = {} + mts = {} + genres = {} + + # Set up the session + sqlite_file_name = "database.db" + sqlite_url = f"sqlite:///{sqlite_file_name}" + + engine = create_engine(sqlite_url, echo=False) + + SQLModel.metadata.create_all(engine) + + with Session(engine) as session: + populate_database(session, PRONOM(), exts, mts, genres) + populate_database(session, WikiData(), exts, mts, genres) + populate_database(session, TCDB(), exts, mts, genres) + + diff --git a/foreging/pronom.py b/foreging/pronom.py index ae604371..228c74d2 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -2,7 +2,7 @@ import logging import datetime from bs4 import BeautifulSoup -from .models import Format +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -12,13 +12,19 @@ class PRONOM(): source_folder = 'digipres.github.io/_sources/registries/pronom/' warnings = [] show_parsed_xml_on_errors = False - + registry = Registry( + id=registry_id, + name="PRONOM", + url="https://www.nationalarchives.gov.uk/PRONOM/", + index_data_url=f"https://github.com/digipres/{source_folder}" + ) + def _date_parser(self, pronom_date): # PRONOM uses '11 Apr 2024' format so this needs parsing here: date = datetime.datetime.strptime(pronom_date, "%d %b %Y") return date - def get_formats(self): + def get_formats(self, exts, mts, genres): logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) for source_folder_name in ['fmt', 'x-fmt']: @@ -26,7 +32,7 @@ def get_formats(self): for filename in os.listdir(source_folder): if filename.endswith(".xml"): - logger.info(f"Parsing {filename}...") + logger.debug(f"Parsing {filename}...") with open(f"{source_folder}/{filename}", "rb") as f: xml = f.read() root = None @@ -43,8 +49,10 @@ def get_formats(self): f_types = [""] # Strip whitespace from genres: f_types = [g.strip() for g in f_types] - # Replace empty strings with "Undefined" + # Replace empty strings with "Undefined": f_types = ['undefined' if not g else g for g in f_types] + # And convert to SQLModel type: + f_types = [Genre(name=g) for g in f_types] # Internal signatures: if root.find('InternalSignature'): f_magic = True @@ -54,13 +62,17 @@ def get_formats(self): extensions = list() for fe in root.findAll('ExternalSignature'): if fe.find('SignatureType', string='File extension'): - extensions.append(fe.find('Signature').text) + ext = fe.find('Signature').text + exts[ext] = exts.get(ext, Extension(id=ext)) + extensions.append(exts[ext]) f_extensions = extensions # Get MIME types: mimetypes = list() for ffi in root.findAll('FileFormatIdentifier'): if ffi.find('IdentifierType', string='MIME'): - mimetypes.append(ffi.find('Identifier').text) + mt = ffi.find('Identifier').text + mts[mt] = mts.get(mt, MediaType(id=mt)) + mimetypes.append(mts[mt]) f_mimetypes = mimetypes # Create record: f = Format( @@ -78,7 +90,7 @@ def get_formats(self): registry_url=f"https://www.nationalarchives.gov.uk/pronom/{ffd_id}", registry_source_data_url=f"https://www.nationalarchives.gov.uk/pronom/{ffd_id}.xml", registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/pronom/{ffd_id}.xml", - additional_fields= None, + #additional_fields= None, created=self._date_parser(root.find('ProvenanceSourceDate').text), last_modified=self._date_parser(root.find('LastUpdatedDate').text), ) diff --git a/foreging/tcdb.py b/foreging/tcdb.py index f13a923d..9ed0f3f2 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -1,7 +1,7 @@ import os import csv import logging -from .models import Format, Software +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -14,24 +14,32 @@ class TCDB(): registry_id = "tcdb" registry_url = f"https://github.com/thorsted/Born-Digital-Scripts/tree/main/TC%20Identification" source_file = 'digipres.github.io/_sources/registries/tcdb/TCDB_2003.8_data-cleaned.csv' - warnings = [] + registry = Registry( + id=registry_id, + name="TCDB", + url=registry_url, + index_data_url=source_file + ) - def get_formats(self): + def get_formats(self, exts, mts, genres): logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) # First, gather rows by type_code... rows_by_type_code = {} # Open, coping with Unicode BOM + line = 1 with open(self.source_file, "r", encoding='utf-8-sig') as csv_file: reader = csv.DictReader(csv_file) for row in reader: logger.debug(f"Processing row: {row}") type_code = row['Type'].strip() - if type_code not in rows_by_type_code: - rows_by_type_code[type_code] = [] + rows_by_type_code[type_code] = rows_by_type_code.get(type_code, []) rows_by_type_code[type_code].append(row) + line += 1 + row['_line_number'] = line # Now, process each type_code: + sws = {} for type_code, rows in rows_by_type_code.items(): readers = [] extensions = set() @@ -40,44 +48,49 @@ def get_formats(self): for row in rows: logger.debug(f"Processing row: {row}") creator_code = row['Creator'].strip() - extensions.add(row['Extension'].strip()) - categories.add(row['Category'].strip()) + # + ext = row['Extension'].strip().lower() + if ext: + exts[ext] = exts.get(ext,Extension(id=ext)) + extensions.add(exts[ext]) + # + cat = row['Category'].strip() + if cat: + genres[cat] = genres.get(cat, Genre(name=cat)) + categories.add(genres[cat]) + # names.append(row['File Name'].strip()) - s = Software( - registry_id=self.registry_id, - id=f"tcdb:{type_code}:{creator_code}", - name=row['File Name'].strip(), - version=None, - summary=row['Comments'].strip(), - registry_url=self.registry_url, - reads=[f"tcdb:{type_code}"] + # Record the Software ID, adding a line number to make sure everything has distinct IDs. + sw_id = f"tcdb:{type_code}:{creator_code}@L{row['_line_number']}" + sws[sw_id] = sws.get(sw_id, + Software( + registry=self.registry, + id=sw_id, + name=row['File Name'].strip(), + version=None, + summary=row['Comments'].strip() + ) ) - readers.append(s) - # Set up as a format entity: + readers.append(sws[sw_id]) + # Set up as a format entity for this type_code: f = Format( - registry_id=self.registry_id, + registry=self.registry, id=f"tcdb:{type_code}", name= ", ".join(names)[:256], # FIXME Limit size as this includes too much software information and is very slow to work with! version=None, summary=None, - genres=[x for x in categories if x], - extensions=[x.lower() for x in extensions if x], + genres=list(categories), + extensions=list(extensions), iana_media_types=[], has_magic=False, primary_media_type=None, parent_media_type=None, - registry_url=self.registry_url, - registry_source_data_url=f"https://github.com/thorsted/Born-Digital-Scripts/blob/main/TC%20Identification/TCDB_2003.8_data-cleaned.csv", - registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/tcdb/TCDB_2003.8_data-cleaned.csv", + registry_url=None, + registry_source_data_url=None, + registry_index_data_url=None, created=None, last_modified=None, - readers=readers, + readers=readers ) logger.debug(f"Generated format: {f}") yield f - - -if __name__ == "__main__": - gen = TCDB() - for f in gen.get_formats(): - print(f.model_dump_json()) \ No newline at end of file diff --git a/foreging/wikidata.py b/foreging/wikidata.py index df2e336f..70a918cb 100644 --- a/foreging/wikidata.py +++ b/foreging/wikidata.py @@ -1,10 +1,6 @@ -from collections import defaultdict -import os import json import logging -from .models_sql import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry -from sqlmodel import Field, Relationship, Session, SQLModel, create_engine -import argparse +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -29,15 +25,13 @@ class WikiData(): ) - def get_formats(self): + def get_formats(self, exts, mts, genres): logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) with open (self.fmt_source_file, 'r') as f: wd = json.load(f) fmts = {} - exts = {} - mts = {} warnings = set() current_qid = None @@ -162,20 +156,3 @@ def make_software(self, info): return s -if __name__ == "__main__": - sqlite_file_name = "database.db" - sqlite_url = f"sqlite:///{sqlite_file_name}" - - engine = create_engine(sqlite_url, echo=False) - - SQLModel.metadata.create_all(engine) - - with Session(engine) as session: - - gen = WikiData() - i = 0 - for f in gen.get_formats(): - session.add(f) - i += 1 - if i % 100 == 0: - session.commit() From 1f42368a444e7805767fdc20806b517183bbbc5e Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 14 Feb 2025 18:43:48 +0000 Subject: [PATCH 23/53] LC FDD now included. --- foreging/loc_fdd.py | 78 ++++++++++++++++++++++++++++---------------- foreging/populate.py | 47 +++++++++++++++++++++----- foreging/pronom.py | 2 -- foreging/tcdb.py | 2 -- foreging/wikidata.py | 1 - 5 files changed, 88 insertions(+), 42 deletions(-) diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py index 4015ed5f..dde2dafb 100644 --- a/foreging/loc_fdd.py +++ b/foreging/loc_fdd.py @@ -1,23 +1,27 @@ import os import logging from bs4 import BeautifulSoup -from .models import Format +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class LocFDD(): - registry_id = "loc-fdd" + registry_id = "lcfdd" source_folder = 'digipres.github.io/_sources/registries/fdd/fddXML' - warnings = [] show_parsed_xml_on_errors = False + registry = Registry( + id=registry_id, + name="LC FDD" + ) - def get_formats(self): - logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) + def get_formats(self, exts, mts, genres): + + fmts = {} for filename in os.listdir(self.source_folder): if filename.endswith(".xml"): - logger.info(f"Parsing {filename}...") + logger.debug(f"Parsing {filename}...") with open(f"{self.source_folder}/{filename}", "rb") as f: xml = f.read() root = None @@ -28,6 +32,18 @@ def get_formats(self): root = BeautifulSoup(xml, "xml") ffd_id = root.find('FDD').get('id') f_name = root.find('FDD').get('titleName') + + # Check if we should keep this one, or if something seems to have gone wrong: + if filename != f"{ffd_id}.xml": + self.registry.data_log.append( + RegistryDataLogEntry( + level="error", + message=f"File name of {filename} does not match embedded FDD ID of {ffd_id}", + url=f"https://www.loc.gov/preservation/digital/formats/fddXML/{filename}" + ) + ) + continue + # If there's a version string, grab it: f_version = None if ", Version " in f_name: @@ -36,58 +52,64 @@ def get_formats(self): f_genres = list() for gns in root.findAll('gdfrGenreSelection'): for gn in gns.findAll('gdfrGenre'): - f_genres.append(f"gdfr:{gn.text}") + f_genres.append(Genre(name=f"gdfr:{gn.text}")) # Haz Magic? if root.find('magicNumbers'): f_magic = True else: f_magic = False # Get extensions: - extensions = list() + f_extensions = set() for fe in root.findAll('filenameExtension'): for fev in fe.findAll('sigValue'): - extensions.append("%s" % fev.text) - f_extensions = extensions + ext = f"{fev.text}" + exts[ext] = exts.get(ext, Extension(id=ext)) + f_extensions.add(exts[ext]) # Get MIME types: - mimetypes = list() + f_mimetypes = set() for imts in root.findAll('internetMediaType'): for mt in imts.findAll('sigValue'): - mimetypes.append(mt.text) - f_mimetypes = mimetypes + f_mimetypes.add(mt.text) # Find the date: edit_date = root.findAll('date')[-1].text # Create record: f = Format( - registry_id=self.registry_id, - id=ffd_id, + registry=self.registry, + id=f"{self.registry_id}:{ffd_id}", name=f_name, version=f_version, summary=root.find("shortDescription").text, genres=f_genres, - extensions=f_extensions, - iana_media_types=f_mimetypes, + extensions=list(f_extensions), + #iana_media_types=f_mimetypes, has_magic=f_magic, primary_media_type=None, parent_media_type=None, - registry_url=f"https://www.loc.gov/preservation/digital/formats/fdd/{ffd_id}.shtml", - registry_source_data_url=f"https://www.loc.gov/preservation/digital/formats/fdd/{ffd_id}.xml", + registry_url=f"https://www.loc.gov/preservation/digital/formats/fdd/{ffd_id}/", + registry_source_data_url=f"https://www.loc.gov/preservation/digital/formats/fddXML/{filename}", registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/fdd/fddXML/{ffd_id}.xml", additional_fields= None, - created=edit_date, - last_modified=edit_date, + #created=edit_date, + #last_modified=edit_date, ) - yield f + fmts[ffd_id] = f + except Exception as e: - logger.error(f"Parsing {filename} failed: {e}") - self.warnings.append(f"Error when parsing XML from '{filename}': {e}") + logger.error(f"Parsing {filename} {ffd_id} failed: {e}") + self.registry.data_log.append( + RegistryDataLogEntry( + level='error', + message=f"Error when parsing XML from '{filename}': {e}" + ) + ) # Emit extra debug info if possible: if root and self.show_parsed_xml_on_errors: logger.error("XML parsed as:") logger.error(root.prettify()) #print(etree.tostring(root, pretty_print=True).decode('utf-8')) + # Return the values: + for id in fmts: + f = fmts[id] + yield f -if __name__ == "__main__": - gen = LocFDD() - for f in gen.get_formats(): - print(f.model_dump_json()) \ No newline at end of file diff --git a/foreging/populate.py b/foreging/populate.py index d83a0862..aa28846c 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -1,18 +1,30 @@ +from .loc_fdd import LocFDD +from .nara import NARA from .pronom import PRONOM from .tcdb import TCDB from .wikidata import WikiData + from sqlmodel import Session, SQLModel, create_engine +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) +# Size of the chunks of data to commit (makes things faster but more memory load) +COMMIT_SIZE = 200 +# Push in the data: def populate_database(session, gen, exts, mts, genres): - i = 0 - for f in gen.get_formats(exts, mts, genres): - session.add(f) - i += 1 - if i % 200 == 0: - session.commit() - # And get the last few in: - session.commit() + logger.info("Getting transformed format records for registry ID %s..." % gen.registry_id) + # Counter to stage commits in chunks + i = 0 + for f in gen.get_formats(exts, mts, genres): + session.add(f) + i += 1 + if i % COMMIT_SIZE == 0: + session.commit() + # And get the last few in: + session.commit() if __name__ == "__main__": @@ -30,8 +42,25 @@ def populate_database(session, gen, exts, mts, genres): SQLModel.metadata.create_all(engine) with Session(engine) as session: + # FFW + + # GithubLinguist + + # LC FDD + populate_database(session, LocFDD(), exts, mts, genres) + # NARA + ##populate_database(session, NARA(), exts, mts, genres) + # PRONOM populate_database(session, PRONOM(), exts, mts, genres) - populate_database(session, WikiData(), exts, mts, genres) + # TCDB populate_database(session, TCDB(), exts, mts, genres) + # Tika + + # TRiD + + # WikiData + populate_database(session, WikiData(), exts, mts, genres) + + diff --git a/foreging/pronom.py b/foreging/pronom.py index 228c74d2..344b8eb9 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -25,8 +25,6 @@ def _date_parser(self, pronom_date): return date def get_formats(self, exts, mts, genres): - logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) - for source_folder_name in ['fmt', 'x-fmt']: source_folder = os.path.join(self.source_folder, source_folder_name) diff --git a/foreging/tcdb.py b/foreging/tcdb.py index 9ed0f3f2..6e8c0c78 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -22,8 +22,6 @@ class TCDB(): ) def get_formats(self, exts, mts, genres): - logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) - # First, gather rows by type_code... rows_by_type_code = {} # Open, coping with Unicode BOM diff --git a/foreging/wikidata.py b/foreging/wikidata.py index 70a918cb..d579e172 100644 --- a/foreging/wikidata.py +++ b/foreging/wikidata.py @@ -26,7 +26,6 @@ class WikiData(): def get_formats(self, exts, mts, genres): - logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) with open (self.fmt_source_file, 'r') as f: wd = json.load(f) From eb36ac6eb631f4ced59905920a3f5a1f0b978a73 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 14 Feb 2025 20:22:04 +0000 Subject: [PATCH 24/53] Add NARA, fix up build system. --- Makefile | 38 ++++++++------------------- derive.sh | 2 +- foreging/models.py | 2 +- foreging/nara.py | 61 +++++++++++++++++++++++++++++++------------- foreging/populate.py | 41 +++++++++++++++-------------- foreging/wikidata.py | 2 +- 6 files changed, 77 insertions(+), 69 deletions(-) diff --git a/Makefile b/Makefile index 2f6c0828..cfd0b3bb 100644 --- a/Makefile +++ b/Makefile @@ -1,31 +1,15 @@ -DATAFILES := data/pronom.jsonl data/loc.jsonl data/nara.jsonl data/tcdb.jsonl data/wikidata.jsonl +all: registries.db -all: data data/registries.db - -data: +registries.db: foreging/*.py + rm -f $@ $@.tmp mkdir -p data + python -m foreging.populate $@.tmp + sqlite-utils enable-fts $@.tmp format name version summary + sqlite-utils enable-fts $@.tmp media_type id + sqlite-utils enable-fts $@.tmp extension id + sqlite-utils enable-fts $@.tmp genre name + sqlite-utils enable-fts $@.tmp software name version summary + sqlite-utils enable-fts $@.tmp registry_data_log level message + mv $@.tmp $@ -data/pronom.jsonl: foreging/pronom.py digipres.github.io/_sources/registries - python -m foreging.pronom > $@ - -data/loc_fdd.jsonl: foreging/loc_fdd.py digipres.github.io/_sources/registries - python -m foreging.loc_fdd > $@ - -data/nara.jsonl: foreging/nara.py digipres.github.io/_sources/registries - python -m foreging.nara > $@ - -data/tcdb.jsonl: foreging/tcdb.py digipres.github.io/_sources/registries - python -m foreging.tcdb > $@ - -data/wikidata.jsonl: foreging/wikidata.py digipres.github.io/_sources/registries - python -m foreging.wikidata > $@ - -data/registries.db: $(DATAFILES) - rm -f $@ - sqlite-utils insert $@ formats --nl data/pronom.jsonl - sqlite-utils insert $@ formats --nl data/loc_fdd.jsonl - sqlite-utils insert $@ formats --nl data/nara.jsonl - sqlite-utils insert $@ formats --nl data/tcdb.jsonl - sqlite-utils insert $@ formats --nl data/wikidata.jsonl - sqlite-utils enable-fts $@ formats name summary extensions iana_media_types genres readers writers additional_fields \ No newline at end of file diff --git a/derive.sh b/derive.sh index c2eb3d93..7a0fc294 100755 --- a/derive.sh +++ b/derive.sh @@ -5,5 +5,5 @@ source venv/bin/activate make -cp data/registries.db digipres.github.io/_data/formats/ +cp registries.db digipres.github.io/_data/formats/ diff --git a/foreging/models.py b/foreging/models.py index 9d7345e4..a1bfb3b6 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -109,7 +109,7 @@ def __eq__(self,other): class Format(SQLModel, table=True): id: str | None = Field(default=None, primary_key=True) - name: str = Field(index=True) + name: str | None = Field(index=True) version: str | None = Field(index=True) summary: str | None = Field(index=True) genres: list["Genre"] = Relationship(back_populates="formats", link_model=FormatGenresLink) diff --git a/foreging/nara.py b/foreging/nara.py index 424298b4..a95aff87 100644 --- a/foreging/nara.py +++ b/foreging/nara.py @@ -1,11 +1,10 @@ -import os +import json import logging from rdflib import Graph, RDF, DCTERMS from rdflib.namespace import DefinedNamespace, Namespace from rdflib.term import URIRef -from .models import Format +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry -logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # @@ -40,21 +39,51 @@ class NARA_FFPP(): registry_id = "nara-ffpp" source_file = 'digipres.github.io/_sources/registries/nara/fileformats.ttl' warnings = [] + registry = Registry( + id=registry_id, + name="NARA FFPP", + url="https://www.archives.gov/preservation/digital-preservation/linked-data" + ) - def get_formats(self): - logger.info("Getting transformed format records for registry ID %s..." % self.registry_id) + def get_formats(self, exts, mts, grs): g = Graph() g.parse(self.source_file) - for s, p, o in g.triples((None, RDF.type, NARA.FileFormat)): - ff_id = g.value(s, DCTERMS.identifier) - additionals = {} - for p in [ NARA.preservationAction, NARA.preservationPlan, NARA.tools, WDT.p2748, WDT.p3381, WDT.p973]: + ff_id = f"{self.registry_id}:{g.value(s, DCTERMS.identifier)}" + # Grab: Action, Plan, Tools, PUID, FFW, Described-At + additional = {} + for p in [ NARA.preservationAction, NARA.preservationPlan, WDT.p2748, WDT.p3381, WDT.p973]: value = g.value(s, p) if value: - additionals[p] = [o for s, p, o in g.triples((s, p, None))] + additional[p] = [o for s, p, o in g.triples((s, p, None))] + logger.debug("Additional fields: " + json.dumps(additional, indent=2)) + # Set up entities: + extensions = set() + for ext in [o for s, p, o in g.triples((s, WDT.p1195, None))]: + ext = str(ext) + exts[ext] = exts.get(ext, Extension(id=ext)) + extensions.add(exts[ext]) + genres = [] + for genre in [o for s, p, o in g.triples((s, NARA.category, None))]: + genre = str(genre) + grs[genre] = grs.get(genre, Genre(name=genre)) + genres.append(grs[genre]) + media_types = [] + for mt in [o for s, p, o in g.triples((s, WDT.p1163, None))]: + mt = str(mt) + mts[mt] = mts.get(mt, MediaType(id=mt)) + media_types.append(mts[mt]) + readers = [] + for tool in [o for s, p, o in g.triples((s, NARA.tools, None))]: + s = Software( + registry=self.registry, + id=f"{ff_id}+{len(readers)}", + name=str(tool) + ) + readers.append(s) + # Set up as a format entity: f = Format( registry_id=self.registry_id, @@ -62,23 +91,19 @@ def get_formats(self): name=g.value(s, NARA.formatName), version=None, summary=g.value(s, DCTERMS.description), - genres=[o for s, p, o in g.triples((s, NARA.category, None))], - extensions=[o for s, p, o in g.triples((s, WDT.p1195, None))], - iana_media_types=[o for s, p, o in g.triples((s, WDT.p1163, None))], + genres=genres, + extensions=list(extensions), + media_types=media_types, has_magic=False, primary_media_type=None, parent_media_type=None, registry_url=f"https://www.archives.gov/preservation/digital-preservation/linked-data#{ff_id}", registry_source_data_url=f"https://www.archives.gov/files/lod/dpframework/id/{ff_id}.ttl", registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/nara/fileformats.ttl#{ff_id}", - additional_fields=additionals, created=None, last_modified=None, + readers=readers ) yield f -if __name__ == "__main__": - gen = NARA_FFPP() - for f in gen.get_formats(): - print(f.model_dump_json()) \ No newline at end of file diff --git a/foreging/populate.py b/foreging/populate.py index aa28846c..6641b708 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -1,17 +1,18 @@ from .loc_fdd import LocFDD -from .nara import NARA +from .nara import NARA_FFPP from .pronom import PRONOM from .tcdb import TCDB from .wikidata import WikiData from sqlmodel import Session, SQLModel, create_engine +import argparse import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Size of the chunks of data to commit (makes things faster but more memory load) -COMMIT_SIZE = 200 +COMMIT_SIZE = 250 # Push in the data: def populate_database(session, gen, exts, mts, genres): @@ -27,6 +28,18 @@ def populate_database(session, gen, exts, mts, genres): session.commit() if __name__ == "__main__": + # Registries + registries = {} + for r in [LocFDD(), NARA_FFPP(), PRONOM(), TCDB(), WikiData()]: + registries[r.registry.id] = r + # TO-ADD: FFW, GithubLinguist, Tika, TRiD + + # Args + parser = argparse.ArgumentParser() + parser.add_argument('--only', required=False, choices=registries.keys()) + parser.add_argument('output_file') + args = parser.parse_args() + # Cache the cross-referenced entities: exts = {} @@ -34,7 +47,7 @@ def populate_database(session, gen, exts, mts, genres): genres = {} # Set up the session - sqlite_file_name = "database.db" + sqlite_file_name = args.output_file sqlite_url = f"sqlite:///{sqlite_file_name}" engine = create_engine(sqlite_url, echo=False) @@ -42,24 +55,10 @@ def populate_database(session, gen, exts, mts, genres): SQLModel.metadata.create_all(engine) with Session(engine) as session: - # FFW - - # GithubLinguist - - # LC FDD - populate_database(session, LocFDD(), exts, mts, genres) - # NARA - ##populate_database(session, NARA(), exts, mts, genres) - # PRONOM - populate_database(session, PRONOM(), exts, mts, genres) - # TCDB - populate_database(session, TCDB(), exts, mts, genres) - # Tika - - # TRiD - - # WikiData - populate_database(session, WikiData(), exts, mts, genres) + for reg_id in registries: + reg = registries[reg_id] + if args.only == None or args.only == reg_id: + populate_database(session, reg, exts, mts, genres) diff --git a/foreging/wikidata.py b/foreging/wikidata.py index d579e172..5af9fb22 100644 --- a/foreging/wikidata.py +++ b/foreging/wikidata.py @@ -87,7 +87,7 @@ def get_formats(self, exts, mts, genres): # Check it's in the set: if qid not in fmts: warning = f"Software entry '{sw_qid}: {sw['formatLabel']}' references missing format '{qid}'" - logger.warning( warning ) + logger.debug( warning ) warnings.add( RegistryDataLogEntry(level="warning", message=warning, url=sw['source'] ) ) continue if sw_qid not in sws: From 3ec38c2e0ed506586b626c5ec2c0add8c7bde055 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 14 Feb 2025 20:41:01 +0000 Subject: [PATCH 25/53] Add dates that had been missed. --- foreging/loc_fdd.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py index dde2dafb..226288b2 100644 --- a/foreging/loc_fdd.py +++ b/foreging/loc_fdd.py @@ -1,5 +1,6 @@ import os import logging +import datetime from bs4 import BeautifulSoup from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry @@ -69,9 +70,20 @@ def get_formats(self, exts, mts, genres): f_mimetypes = set() for imts in root.findAll('internetMediaType'): for mt in imts.findAll('sigValue'): - f_mimetypes.add(mt.text) + mt = mt.text + mts[mt] = mts.get(mt, MediaType(id=mt)) + f_mimetypes.add(mts[mt]) # Find the date: edit_date = root.findAll('date')[-1].text + try: + edit_date = datetime.date.fromisoformat(edit_date) + except ValueError: + self.registry.data_log.append(RegistryDataLogEntry( + level='warning', + message=f"Unexpected data format '{edit_date}' for record {ffd_id}, expected 'YYYY-MM-DD'.", + )) + edit_date = None + # Create record: f = Format( registry=self.registry, @@ -81,7 +93,7 @@ def get_formats(self, exts, mts, genres): summary=root.find("shortDescription").text, genres=f_genres, extensions=list(f_extensions), - #iana_media_types=f_mimetypes, + media_types=list(f_mimetypes), has_magic=f_magic, primary_media_type=None, parent_media_type=None, @@ -89,8 +101,8 @@ def get_formats(self, exts, mts, genres): registry_source_data_url=f"https://www.loc.gov/preservation/digital/formats/fddXML/{filename}", registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/fdd/fddXML/{ffd_id}.xml", additional_fields= None, - #created=edit_date, - #last_modified=edit_date, + created=edit_date, + last_modified=edit_date, ) fmts[ffd_id] = f From 2ac5827940b747fe9d790fc1d2c7f6951284e0dc Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 14 Feb 2025 23:55:10 +0000 Subject: [PATCH 26/53] Couple of fixes, PRONOM prefix and media types. --- foreging/pronom.py | 4 ++-- foreging/tcdb.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/foreging/pronom.py b/foreging/pronom.py index 344b8eb9..0fbbe555 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -39,7 +39,7 @@ def get_formats(self, exts, mts, genres): #parser = etree.XMLParser() #root = etree.parse(BytesIO(xml), parser) root = BeautifulSoup(xml, "xml") - ffd_id = f"{source_folder_name}/{filename[0:-4]}" + ffd_id = f"pronom:{source_folder_name}/{filename[0:-4]}" f_name = root.find('FormatName').text # Genres: f_types = root.find('FormatTypes').text.strip().split(',') @@ -81,7 +81,7 @@ def get_formats(self, exts, mts, genres): summary=root.find("FormatDescription").text, genres=f_types, extensions=f_extensions, - iana_media_types=f_mimetypes, + media_types=f_mimetypes, has_magic=f_magic, primary_media_type=None, parent_media_type=None, diff --git a/foreging/tcdb.py b/foreging/tcdb.py index 6e8c0c78..501760ae 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -79,7 +79,7 @@ def get_formats(self, exts, mts, genres): summary=None, genres=list(categories), extensions=list(extensions), - iana_media_types=[], + media_types=[], has_magic=False, primary_media_type=None, parent_media_type=None, From 3d1605c25b56a2be0addff7a7ea0cc03d5096243 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sat, 15 Feb 2025 10:32:50 +0000 Subject: [PATCH 27/53] Some NARA fixes. --- foreging/loc_fdd.py | 4 +++- foreging/models.py | 1 + foreging/nara.py | 19 ++++++++++--------- foreging/pronom.py | 3 ++- foreging/tcdb.py | 3 ++- foreging/wikidata.py | 1 + 6 files changed, 19 insertions(+), 12 deletions(-) diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py index 226288b2..25beecb3 100644 --- a/foreging/loc_fdd.py +++ b/foreging/loc_fdd.py @@ -13,7 +13,9 @@ class LocFDD(): show_parsed_xml_on_errors = False registry = Registry( id=registry_id, - name="LC FDD" + name="Library of Congress Format Description Documents", + url="https://www.loc.gov/preservation/digital/formats/", + id_prefix='https://www.loc.gov/preservation/digital/formats/fdd/' ) def get_formats(self, exts, mts, genres): diff --git a/foreging/models.py b/foreging/models.py index a1bfb3b6..c12fe183 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -6,6 +6,7 @@ class Registry(SQLModel, table=True): id: str | None = Field(default=None, primary_key=True) name: str = Field(index=True) url: str | None = Field() + id_prefix: str | None = Field() index_data_url: str | None = Field() data_log: list["RegistryDataLogEntry"] = Relationship() diff --git a/foreging/nara.py b/foreging/nara.py index a95aff87..98badc6e 100644 --- a/foreging/nara.py +++ b/foreging/nara.py @@ -36,13 +36,14 @@ class WDT(DefinedNamespace): # NARA File Format Preservation Plan parser # class NARA_FFPP(): - registry_id = "nara-ffpp" + registry_id = "naradpf" source_file = 'digipres.github.io/_sources/registries/nara/fileformats.ttl' warnings = [] registry = Registry( id=registry_id, - name="NARA FFPP", - url="https://www.archives.gov/preservation/digital-preservation/linked-data" + name="NARA Digital Preservation Framework", + url="https://www.archives.gov/preservation/digital-preservation/linked-data", + id_prefix='https://www.archives.gov/files/lod/dpframework/id/' ) def get_formats(self, exts, mts, grs): @@ -51,7 +52,7 @@ def get_formats(self, exts, mts, grs): g.parse(self.source_file) for s, p, o in g.triples((None, RDF.type, NARA.FileFormat)): - ff_id = f"{self.registry_id}:{g.value(s, DCTERMS.identifier)}" + ff_id = f"{self.registry_id}:{g.value(s, DCTERMS.identifier)}.ttl" # Grab: Action, Plan, Tools, PUID, FFW, Described-At additional = {} for p in [ NARA.preservationAction, NARA.preservationPlan, WDT.p2748, WDT.p3381, WDT.p973]: @@ -59,7 +60,7 @@ def get_formats(self, exts, mts, grs): if value: additional[p] = [o for s, p, o in g.triples((s, p, None))] logger.debug("Additional fields: " + json.dumps(additional, indent=2)) - # Set up entities: + # Set up other fields: extensions = set() for ext in [o for s, p, o in g.triples((s, WDT.p1195, None))]: ext = str(ext) @@ -77,13 +78,13 @@ def get_formats(self, exts, mts, grs): media_types.append(mts[mt]) readers = [] for tool in [o for s, p, o in g.triples((s, NARA.tools, None))]: - s = Software( + sw = Software( registry=self.registry, - id=f"{ff_id}+{len(readers)}", + id=f"{ff_id}#{len(readers)}", name=str(tool) ) - readers.append(s) - + readers.append(sw) + # Set up as a format entity: f = Format( registry_id=self.registry_id, diff --git a/foreging/pronom.py b/foreging/pronom.py index 0fbbe555..a64ecd47 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -14,8 +14,9 @@ class PRONOM(): show_parsed_xml_on_errors = False registry = Registry( id=registry_id, - name="PRONOM", + name="UK National Archives PRONOM Technical Registry", url="https://www.nationalarchives.gov.uk/PRONOM/", + id_prefix='https://www.nationalarchives.gov.uk/PRONOM/', index_data_url=f"https://github.com/digipres/{source_folder}" ) diff --git a/foreging/tcdb.py b/foreging/tcdb.py index 501760ae..ebec8bd1 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -16,8 +16,9 @@ class TCDB(): source_file = 'digipres.github.io/_sources/registries/tcdb/TCDB_2003.8_data-cleaned.csv' registry = Registry( id=registry_id, - name="TCDB", + name="Macintosh Type/Creator Codes Database", url=registry_url, + id_prefix=None, index_data_url=source_file ) diff --git a/foreging/wikidata.py b/foreging/wikidata.py index 5af9fb22..bef0238c 100644 --- a/foreging/wikidata.py +++ b/foreging/wikidata.py @@ -21,6 +21,7 @@ class WikiData(): id=registry_id, name="WikiData", url="https://www.wikidata.org/wiki/Wikidata:WikiProject_Informatics/Structures/File_formats", + id_prefix='http://www.wikidata.org/entity/', index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{source_file_dir}" ) From fd1dceb71bc9b88b638305a175801dce732642b3 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sat, 15 Feb 2025 12:35:37 +0000 Subject: [PATCH 28/53] Switch TCDB data fields. --- foreging/tcdb.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/foreging/tcdb.py b/foreging/tcdb.py index ebec8bd1..a1e1f8cf 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -60,14 +60,14 @@ def get_formats(self, exts, mts, genres): # names.append(row['File Name'].strip()) # Record the Software ID, adding a line number to make sure everything has distinct IDs. - sw_id = f"tcdb:{type_code}:{creator_code}@L{row['_line_number']}" + sw_id = f"tcdb:{type_code}:{creator_code}#L{row['_line_number']}" sws[sw_id] = sws.get(sw_id, Software( registry=self.registry, id=sw_id, - name=row['File Name'].strip(), + name=row['Comments'].strip(), # Software name usually stored in the Comments field. version=None, - summary=row['Comments'].strip() + summary=row['File Name'].strip() ) ) readers.append(sws[sw_id]) From e651e6a1c20ebca749f75060d3ed6829cb7aef7f Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sat, 15 Feb 2025 13:12:19 +0000 Subject: [PATCH 29/53] Missed dependency. --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ef7e6115..29820259 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,8 @@ dependencies = [ "rdflib", "pydantic", "sqlite-utils", + "sqlmodel" ] [tool.setuptools.packages.find] -include = ["foreging"] \ No newline at end of file +include = ["foreging"] From ef992e3d39e236fae51fc7cc32e43f8305d05dd8 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sat, 15 Feb 2025 13:12:26 +0000 Subject: [PATCH 30/53] Split NARA tools. --- foreging/nara.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/foreging/nara.py b/foreging/nara.py index 98badc6e..4c17cc66 100644 --- a/foreging/nara.py +++ b/foreging/nara.py @@ -77,13 +77,14 @@ def get_formats(self, exts, mts, grs): mts[mt] = mts.get(mt, MediaType(id=mt)) media_types.append(mts[mt]) readers = [] - for tool in [o for s, p, o in g.triples((s, NARA.tools, None))]: - sw = Software( - registry=self.registry, - id=f"{ff_id}#{len(readers)}", - name=str(tool) - ) - readers.append(sw) + for tools in [o for s, p, o in g.triples((s, NARA.tools, None))]: + for tool in str(tools).split(';'): + sw = Software( + registry=self.registry, + id=f"{ff_id}#{len(readers)}", + name=tool.strip() + ) + readers.append(sw) # Set up as a format entity: f = Format( From f384631c4903968a5819dadb4894a2fda2dbde72 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sat, 15 Feb 2025 22:49:07 +0000 Subject: [PATCH 31/53] Added FFW import using current minimal data. --- foreging/ffw.py | 67 ++++++++++++++++++++++++++++++++++++++++++++ foreging/nara.py | 3 +- foreging/populate.py | 11 +++++--- foreging/pronom.py | 2 +- 4 files changed, 76 insertions(+), 7 deletions(-) create mode 100644 foreging/ffw.py diff --git a/foreging/ffw.py b/foreging/ffw.py new file mode 100644 index 00000000..c6d49494 --- /dev/null +++ b/foreging/ffw.py @@ -0,0 +1,67 @@ +import json +import yaml +import logging +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# +class FFW(): + source_file = "digipres.github.io/_sources/registries/mediawikis/ffw.yml" + # Set up the Registry object for this class: + registry_id = "ffw" + registry = Registry( + id=registry_id, + name="Just Solve The Problem File Formats Wiki", + url="http://fileformats.archiveteam.org/", + id_prefix='http://fileformats.archiveteam.org/wiki/', + index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{source_file}" + ) + + def get_formats(self, exts, mts, gnrs): + stream = open(self.source_file, 'r') + ffw = yaml.safe_load(stream) + stream.close() + + for fmt in ffw['formats']: + f_info = {} + f_info['extensions'] = set() + f_info['mimetypes'] = set() + f_info['hasMagic'] = False + ff_id = 'ffw:' + fmt['name'] + for key in fmt: + if key == 'extensions': + for ext in fmt[key]: + if ext: + ext=ext.lower() + exts[ext] = exts.get(ext, Extension(id=ext)) + f_info['extensions'].add(exts[ext]) + elif key == 'mimetypes': + for mt in fmt[key]: + mts[mt] = mts.get(mt, MediaType(id=mt)) + f_info['mimetypes'].add(mts[mt]) + else: + f_info[key] = fmt[key] + + # Set up as a format entity: + f = Format( + registry=self.registry, + id=ff_id, + name=f_info['name'], + version=None, + summary=None, + genres=[], + extensions=list(f_info['extensions']), + media_types=list(f_info['mimetypes']), + has_magic=f_info['hasMagic'], + primary_media_type=None, + parent_media_type=None, + registry_url=f"", + registry_source_data_url=f"", + registry_index_data_url=f"", + created=None, + last_modified=None + ) + yield f \ No newline at end of file diff --git a/foreging/nara.py b/foreging/nara.py index 4c17cc66..3a465594 100644 --- a/foreging/nara.py +++ b/foreging/nara.py @@ -38,7 +38,6 @@ class WDT(DefinedNamespace): class NARA_FFPP(): registry_id = "naradpf" source_file = 'digipres.github.io/_sources/registries/nara/fileformats.ttl' - warnings = [] registry = Registry( id=registry_id, name="NARA Digital Preservation Framework", @@ -88,7 +87,7 @@ def get_formats(self, exts, mts, grs): # Set up as a format entity: f = Format( - registry_id=self.registry_id, + registry=self.registry, id=ff_id, name=g.value(s, NARA.formatName), version=None, diff --git a/foreging/populate.py b/foreging/populate.py index 6641b708..8acb0c05 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -1,3 +1,4 @@ +from .ffw import FFW from .loc_fdd import LocFDD from .nara import NARA_FFPP from .pronom import PRONOM @@ -12,7 +13,7 @@ logger = logging.getLogger(__name__) # Size of the chunks of data to commit (makes things faster but more memory load) -COMMIT_SIZE = 250 +COMMIT_SIZE = 1000 # Push in the data: def populate_database(session, gen, exts, mts, genres): @@ -24,13 +25,15 @@ def populate_database(session, gen, exts, mts, genres): i += 1 if i % COMMIT_SIZE == 0: session.commit() + i = 0 # And get the last few in: - session.commit() + if i > 0: + session.commit() if __name__ == "__main__": # Registries registries = {} - for r in [LocFDD(), NARA_FFPP(), PRONOM(), TCDB(), WikiData()]: + for r in [FFW(), LocFDD(), NARA_FFPP(), PRONOM(), TCDB(), WikiData()]: registries[r.registry.id] = r # TO-ADD: FFW, GithubLinguist, Tika, TRiD @@ -54,7 +57,7 @@ def populate_database(session, gen, exts, mts, genres): SQLModel.metadata.create_all(engine) - with Session(engine) as session: + with Session(engine).no_autoflush as session: for reg_id in registries: reg = registries[reg_id] if args.only == None or args.only == reg_id: diff --git a/foreging/pronom.py b/foreging/pronom.py index a64ecd47..f9f44ade 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -75,7 +75,7 @@ def get_formats(self, exts, mts, genres): f_mimetypes = mimetypes # Create record: f = Format( - registry_id=self.registry_id, + registry=self.registry, id=ffd_id, name=f_name, version=root.find("FormatVersion").text, From 76a04ac28be73acf267fe476f5da0e19fd5ac06b Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sat, 15 Feb 2025 23:42:36 +0000 Subject: [PATCH 32/53] Also add more FFW data. --- digipres.github.io | 2 +- foreging/ffw.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/digipres.github.io b/digipres.github.io index 64c8237d..18fae77a 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit 64c8237de0f9866fe80d20c239ba9d694e4b381f +Subproject commit 18fae77a4f51fa4617c1486cb0e5356adfd408aa diff --git a/foreging/ffw.py b/foreging/ffw.py index c6d49494..77833a2a 100644 --- a/foreging/ffw.py +++ b/foreging/ffw.py @@ -29,6 +29,7 @@ def get_formats(self, exts, mts, gnrs): f_info = {} f_info['extensions'] = set() f_info['mimetypes'] = set() + f_info['categories'] = set() f_info['hasMagic'] = False ff_id = 'ffw:' + fmt['name'] for key in fmt: @@ -42,6 +43,10 @@ def get_formats(self, exts, mts, gnrs): for mt in fmt[key]: mts[mt] = mts.get(mt, MediaType(id=mt)) f_info['mimetypes'].add(mts[mt]) + elif key == 'categories': + for cat in fmt[key]: + gnrs[cat] = gnrs.get(cat, Genre(name=cat)) + f_info['categories'].add(gnrs[cat]) else: f_info[key] = fmt[key] @@ -51,16 +56,16 @@ def get_formats(self, exts, mts, gnrs): id=ff_id, name=f_info['name'], version=None, - summary=None, - genres=[], + summary=f_info.get('pageStartText', None), + genres=list(f_info['categories']), extensions=list(f_info['extensions']), media_types=list(f_info['mimetypes']), has_magic=f_info['hasMagic'], primary_media_type=None, parent_media_type=None, - registry_url=f"", - registry_source_data_url=f"", - registry_index_data_url=f"", + registry_url=fmt['source'], + registry_source_data_url=fmt['source'], + registry_index_data_url=None, created=None, last_modified=None ) From 3a55a798b1da022f63a71f4c09704ad58dfcce38 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sun, 16 Feb 2025 10:52:27 +0000 Subject: [PATCH 33/53] Added GitHub Linguist into the new DB. --- digipres.github.io | 2 +- foreging/ffw.py | 2 +- foreging/linguist.py | 69 ++++++++++++++++++++++++++++++++++++++++++++ foreging/populate.py | 5 ++-- 4 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 foreging/linguist.py diff --git a/digipres.github.io b/digipres.github.io index 18fae77a..ca75fc5c 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit 18fae77a4f51fa4617c1486cb0e5356adfd408aa +Subproject commit ca75fc5c88f06c41289d927f08847947fda766c6 diff --git a/foreging/ffw.py b/foreging/ffw.py index 77833a2a..71e27434 100644 --- a/foreging/ffw.py +++ b/foreging/ffw.py @@ -31,7 +31,7 @@ def get_formats(self, exts, mts, gnrs): f_info['mimetypes'] = set() f_info['categories'] = set() f_info['hasMagic'] = False - ff_id = 'ffw:' + fmt['name'] + ff_id = self.registry_id + ':' + fmt['name'] for key in fmt: if key == 'extensions': for ext in fmt[key]: diff --git a/foreging/linguist.py b/foreging/linguist.py new file mode 100644 index 00000000..69186f65 --- /dev/null +++ b/foreging/linguist.py @@ -0,0 +1,69 @@ +import json +import yaml +import logging +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# +class Linguist(): + source_file = "digipres.github.io/_sources/registries/githublinguist/languages.yml" + # Set up the Registry object for this class: + registry_id = "linguist" + registry = Registry( + id=registry_id, + name="GitHub Linguist", + url="http://fileformats.archiveteam.org/", + id_prefix='http://fileformats.archiveteam.org/wiki/', + index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{source_file}" + ) + + def get_formats(self, exts, mts, gnrs): + stream = open(self.source_file, 'r') + ghl = yaml.safe_load(stream) + stream.close() + + for fmt_name in ghl: + fmt = ghl[fmt_name] + f_info = {} + f_info['name'] = fmt_name + f_info['extensions'] = set() + f_info['mimetypes'] = set() + f_info['hasMagic'] = False + ff_id = f"{self.registry_id}:{fmt['language_id']}" + for key in fmt: + if key == 'extensions': + for ext in fmt[key]: + if ext: + ext=ext.strip('.') # Drop the prefix dot + exts[ext] = exts.get(ext, Extension(id=ext)) + f_info['extensions'].add(exts[ext]) + elif key == 'codemirror_mime_type': + mt = fmt[key] + mts[mt] = mts.get(mt, MediaType(id=mt)) + f_info['mimetypes'].add(mts[mt]) + else: + f_info[key] = fmt[key] + + # Set up as a format entity: + f = Format( + registry=self.registry, + id=ff_id, + name=f_info['name'], + version=None, + summary=None, + genres=[], + extensions=list(f_info['extensions']), + media_types=list(f_info['mimetypes']), + has_magic=f_info['hasMagic'], + primary_media_type=None, + parent_media_type=None, + registry_url=None, + registry_source_data_url=None, + registry_index_data_url=None, + created=None, + last_modified=None + ) + yield f \ No newline at end of file diff --git a/foreging/populate.py b/foreging/populate.py index 8acb0c05..e4eff2b9 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -1,4 +1,5 @@ from .ffw import FFW +from .linguist import Linguist from .loc_fdd import LocFDD from .nara import NARA_FFPP from .pronom import PRONOM @@ -33,9 +34,9 @@ def populate_database(session, gen, exts, mts, genres): if __name__ == "__main__": # Registries registries = {} - for r in [FFW(), LocFDD(), NARA_FFPP(), PRONOM(), TCDB(), WikiData()]: + for r in [FFW(), Linguist(), LocFDD(), NARA_FFPP(), PRONOM(), TCDB(), WikiData()]: registries[r.registry.id] = r - # TO-ADD: FFW, GithubLinguist, Tika, TRiD + # TO-ADD: GithubLinguist, Tika, TRiD # Args parser = argparse.ArgumentParser() From b12fcf1930bcccce054ed0d28a6ad54aa13d740d Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sun, 16 Feb 2025 11:35:34 +0000 Subject: [PATCH 34/53] Added Tika back in. --- foreging/linguist.py | 6 +-- foreging/populate.py | 7 +-- foreging/tika.py | 123 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 6 deletions(-) create mode 100644 foreging/tika.py diff --git a/foreging/linguist.py b/foreging/linguist.py index 69186f65..8ece7f75 100644 --- a/foreging/linguist.py +++ b/foreging/linguist.py @@ -15,9 +15,9 @@ class Linguist(): registry = Registry( id=registry_id, name="GitHub Linguist", - url="http://fileformats.archiveteam.org/", - id_prefix='http://fileformats.archiveteam.org/wiki/', - index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{source_file}" + url="https://github.com/github-linguist/linguist", + id_prefix=None, + index_data_url=None ) def get_formats(self, exts, mts, gnrs): diff --git a/foreging/populate.py b/foreging/populate.py index e4eff2b9..59d19e56 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -4,6 +4,7 @@ from .nara import NARA_FFPP from .pronom import PRONOM from .tcdb import TCDB +from .tika import Tika from .wikidata import WikiData from sqlmodel import Session, SQLModel, create_engine @@ -14,7 +15,7 @@ logger = logging.getLogger(__name__) # Size of the chunks of data to commit (makes things faster but more memory load) -COMMIT_SIZE = 1000 +COMMIT_SIZE = 2000 # Push in the data: def populate_database(session, gen, exts, mts, genres): @@ -34,9 +35,9 @@ def populate_database(session, gen, exts, mts, genres): if __name__ == "__main__": # Registries registries = {} - for r in [FFW(), Linguist(), LocFDD(), NARA_FFPP(), PRONOM(), TCDB(), WikiData()]: + for r in [FFW(), Linguist(), LocFDD(), NARA_FFPP(), PRONOM(), TCDB(), Tika(), WikiData()]: registries[r.registry.id] = r - # TO-ADD: GithubLinguist, Tika, TRiD + # TO-ADD: TRiD # Args parser = argparse.ArgumentParser() diff --git a/foreging/tika.py b/foreging/tika.py new file mode 100644 index 00000000..ccc7d883 --- /dev/null +++ b/foreging/tika.py @@ -0,0 +1,123 @@ +import json +import yaml +from lxml import etree +from io import BytesIO +import logging +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# +class Tika(): + source_file = "digipres.github.io/_sources/registries/tika/tika-mimetypes.xml" + # Set up the Registry object for this class: + registry_id = "tika" + registry = Registry( + id=registry_id, + name="Apache Tika", + url="https://tika.apache.org/", + id_prefix=None, + index_data_url=None + ) + + def get_formats(self, exts, mts, gnrs): + fmts = [] + with open(self.source_file, "rb") as f: + xml = f.read() + log = [] + try: + parser = etree.XMLParser() + root = etree.parse(BytesIO(xml), parser) + except Exception as e: + log.append( + RegistryDataLogEntry( + level='warning', + message="Error when parsing XML: "+str(e) + ) + ) + parser = etree.XMLParser(recover=True) + root = etree.parse(BytesIO(xml), parser) + for ff in root.findall('mime-type'): + finfo = {} + fid = ff.get('type') + finfo['id'] = fid + finfo['source'] = f"{self.source_file}#L{ff.sourceline}" + # Build the name: + if ff.find('_comment') is not None: + finfo['name'] = ff.find('_comment').text + # Has Magic? + if ff.find('magic') is not None: + finfo['hasMagic'] = True + else: + finfo['hasMagic'] = False + # Look for extensions: + extensions = list() + for ext in ff.findall('glob'): + extension = ext.get('pattern') + exts[extension] = exts.get(extension, Extension(id=extension)) + extensions.append(exts[extension]) + finfo['extensions'] = extensions + # Look for MIME Types: + mimetypes = list() + mimetypes.append(fid) + if ff.find('alias') is not None: + for alias in ff.findall('alias'): + mt = alias.get('type') + if mt: + if mt not in mimetypes: + mimetypes.append(mt) + else: + log.append( + RegistryDataLogEntry( + level='warning', + message="Duplicate MIME type %s for type %s." % (alias.get('type'), fid) + )) + # TODO Spot duplicate aliases. + finfo['mimetypes'] = mimetypes + # Relationships: + if ff.find('sub-class-of') is not None: + finfo['supertype'] = ff.find('sub-class-of').get('type') + if finfo['supertype'] == fid: + log.append( + RegistryDataLogEntry( + level="warning", + message="Format %s has itself as a supertype!" % fid + )) + #addFormat(rid,fid,finfo) + # Also record the XML error, if there was one: + self.registry.data_log.extend(log) + + # Post-process mimetypes: + media_types = [] + for mt in finfo['mimetypes']: + mts[mt] = mts.get(mt, MediaType(id=mt)) + media_types.append(mts[mt]) + parent = finfo.get('supertype', None) + + # Set up as a format entity: + f = Format( + registry=self.registry, + id=f"{self.registry_id}:{fid}", + name=finfo.get('name', None), + version=None, + summary=None, + genres=[], + extensions=list(finfo['extensions']), + media_types=media_types, + has_magic=finfo['hasMagic'], + primary_media_type=media_types[0].id, + parent_media_type=parent, + registry_url=None, + registry_source_data_url=finfo['source'], + registry_index_data_url=None, + created=None, + last_modified=None + ) + # And record the entry: + fmts.append(f) + + # Now yield them, so all the log entries get stored too: + for f in fmts: + yield f \ No newline at end of file From 8fd329d15e1ff3374ca6dcb1e6758e03fa545171 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sun, 16 Feb 2025 11:47:41 +0000 Subject: [PATCH 35/53] Some more Tika info. --- foreging/tika.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/foreging/tika.py b/foreging/tika.py index ccc7d883..e868c5ce 100644 --- a/foreging/tika.py +++ b/foreging/tika.py @@ -12,6 +12,8 @@ # class Tika(): source_file = "digipres.github.io/_sources/registries/tika/tika-mimetypes.xml" + source_url = "https://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml" + index_url = "https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/tika/tika-mimetypes.xml" # Set up the Registry object for this class: registry_id = "tika" registry = Registry( @@ -19,7 +21,7 @@ class Tika(): name="Apache Tika", url="https://tika.apache.org/", id_prefix=None, - index_data_url=None + index_data_url=index_url ) def get_formats(self, exts, mts, gnrs): @@ -43,7 +45,7 @@ def get_formats(self, exts, mts, gnrs): finfo = {} fid = ff.get('type') finfo['id'] = fid - finfo['source'] = f"{self.source_file}#L{ff.sourceline}" + finfo['source'] = f"#L{ff.sourceline}" # Build the name: if ff.find('_comment') is not None: finfo['name'] = ff.find('_comment').text @@ -110,8 +112,8 @@ def get_formats(self, exts, mts, gnrs): primary_media_type=media_types[0].id, parent_media_type=parent, registry_url=None, - registry_source_data_url=finfo['source'], - registry_index_data_url=None, + registry_source_data_url=self.source_url + finfo['source'], + registry_index_data_url=self.index_url + finfo['source'], created=None, last_modified=None ) From 3341658c3999e75d3f400c3709a586401622a915 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sun, 16 Feb 2025 12:24:12 +0000 Subject: [PATCH 36/53] And finally, TrID. --- foreging/populate.py | 18 ++------- foreging/trid.py | 91 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 14 deletions(-) create mode 100644 foreging/trid.py diff --git a/foreging/populate.py b/foreging/populate.py index 59d19e56..3b319bd7 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -5,6 +5,7 @@ from .pronom import PRONOM from .tcdb import TCDB from .tika import Tika +from .trid import TrID from .wikidata import WikiData from sqlmodel import Session, SQLModel, create_engine @@ -14,30 +15,17 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -# Size of the chunks of data to commit (makes things faster but more memory load) -COMMIT_SIZE = 2000 - # Push in the data: def populate_database(session, gen, exts, mts, genres): logger.info("Getting transformed format records for registry ID %s..." % gen.registry_id) - # Counter to stage commits in chunks - i = 0 for f in gen.get_formats(exts, mts, genres): session.add(f) - i += 1 - if i % COMMIT_SIZE == 0: - session.commit() - i = 0 - # And get the last few in: - if i > 0: - session.commit() if __name__ == "__main__": # Registries registries = {} - for r in [FFW(), Linguist(), LocFDD(), NARA_FFPP(), PRONOM(), TCDB(), Tika(), WikiData()]: + for r in [FFW(), Linguist(), LocFDD(), NARA_FFPP(), PRONOM(), TCDB(), Tika(), TrID(), WikiData()]: registries[r.registry.id] = r - # TO-ADD: TRiD # Args parser = argparse.ArgumentParser() @@ -64,6 +52,8 @@ def populate_database(session, gen, exts, mts, genres): reg = registries[reg_id] if args.only == None or args.only == reg_id: populate_database(session, reg, exts, mts, genres) + # Every commit should be self-consistent at this point: + session.commit() diff --git a/foreging/trid.py b/foreging/trid.py new file mode 100644 index 00000000..5539b600 --- /dev/null +++ b/foreging/trid.py @@ -0,0 +1,91 @@ +import json +import yaml +import os +from lxml import etree +from io import BytesIO +import logging +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# +class TrID(): + source_dir = "digipres.github.io/_sources/registries/trid" + source_url = "" + index_url = "https://github.com/digipres/digipres.github.io/tree/master/_sources/registries/trid/triddefs_xml/" + # Set up the Registry object for this class: + registry_id = "trid" + registry = Registry( + id=registry_id, + name="TrID - File Identifier", + url="https://www.mark0.net/soft-trid-e.html", + id_prefix=None, + index_data_url=None + ) + + fmts = [] + + def add_format(self, fid, finfo, exts, mts, gnrs): + media_types = [] + for mt in finfo['mimetypes']: + mts[mt] = mts.get(mt, MediaType(id=mt)) + media_types.append(mts[mt]) + extensions = [] + for ext in finfo['extensions']: + exts[ext] = exts.get(ext, Extension(id=ext)) + extensions.append(exts[ext]) + # Set up as a format entity: + f = Format( + registry=self.registry, + id=f"{self.registry_id}:{fid}", + name=finfo.get('name', None), + version=None, + summary=None, + genres=[], + extensions=extensions, + media_types=media_types, + has_magic=finfo['hasMagic'], + primary_media_type=None, + parent_media_type=None, + registry_url=None, + registry_source_data_url=self.source_url + finfo['source'], + registry_index_data_url=self.index_url + finfo['source'], + created=None, + last_modified=None + ) + # And record the entry: + self.fmts.append(f) + + + + def get_formats(self, exts, mts, gnrs): + for filename in os.listdir(f'{self.source_dir}/triddefs_xml'): + if filename.endswith(".trid.xml"): + # Get Identifier? + with open(f'{self.source_dir}/triddefs_xml/'+filename, "r") as f: + finfo = {} + finfo['source'] = filename + root = etree.parse(f) + fid = filename[:-9] + finfo['name'] = root.findall('Info/FileType')[0].text + if root.find('FrontBlock') is not None: + finfo['hasMagic'] = True + else: + finfo['hasMagic'] = False + # Get extensions: + extensions = list() + for fe in root.findall('Info/Ext'): + if(fe.text != None): + for ext in fe.text.split("/"): + if ext not in extensions: + extensions.append("%s" % ext.lower()) + finfo['extensions'] = extensions + # Get MIME types: + finfo['mimetypes'] = list() + self.add_format(fid, finfo, exts, mts, gnrs) + + # Now yield them, so all the log entries get stored too: + for f in self.fmts: + yield f \ No newline at end of file From 7c50d23eea2e0c3b531c3f43f294d83a8de1b8b8 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Sun, 16 Feb 2025 15:51:19 +0000 Subject: [PATCH 37/53] Strip globs from Tika entries. --- foreging/tika.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/foreging/tika.py b/foreging/tika.py index e868c5ce..07664035 100644 --- a/foreging/tika.py +++ b/foreging/tika.py @@ -57,7 +57,7 @@ def get_formats(self, exts, mts, gnrs): # Look for extensions: extensions = list() for ext in ff.findall('glob'): - extension = ext.get('pattern') + extension = ext.get('pattern').replace('*.','') # Strip the glob exts[extension] = exts.get(extension, Extension(id=extension)) extensions.append(exts[extension]) finfo['extensions'] = extensions From 4255b4bf6798904f6e93742181eb01a5c20baf6f Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 18 Jul 2025 15:47:50 +0100 Subject: [PATCH 38/53] Update latest version of digipres repo, inc. new sources. --- digipres.github.io | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digipres.github.io b/digipres.github.io index ca75fc5c..85f4f0f7 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit ca75fc5c88f06c41289d927f08847947fda766c6 +Subproject commit 85f4f0f732b77be02b3c3a556a9afceb043d042c From a52a73e3d55a1788c0ab8a5a87e90ef732aa64af Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 18 Jul 2025 17:08:38 +0100 Subject: [PATCH 39/53] Adding file/libmagic and MediaInfo sources. --- digipres.github.io | 2 +- foreging/file.py | 68 +++++++++++++++++++++++++++++++++++++++++++ foreging/mediainfo.py | 64 ++++++++++++++++++++++++++++++++++++++++ foreging/populate.py | 4 ++- pyproject.toml | 1 + 5 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 foreging/file.py create mode 100644 foreging/mediainfo.py diff --git a/digipres.github.io b/digipres.github.io index 85f4f0f7..c4ae3013 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit 85f4f0f732b77be02b3c3a556a9afceb043d042c +Subproject commit c4ae3013b3af82c3d668552f03c83db4c9d6c9b6 diff --git a/foreging/file.py b/foreging/file.py new file mode 100644 index 00000000..e92c2f51 --- /dev/null +++ b/foreging/file.py @@ -0,0 +1,68 @@ +import json +import yaml +from lxml import etree +from io import BytesIO +import logging +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# +class File(): + source_file = "digipres.github.io/_sources/registries/file/polyfile-magic.jsonl" + source_url = "https://github.com/trailofbits/polyfile/tree/master/polyfile/magic_defs" + index_url = "https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/file/polyfile-magic.jsonl" + # Set up the Registry object for this class: + registry_id = "file" + registry = Registry( + id=registry_id, + name="file libmagic (via polyfile 0.55)", + url="https://www.darwinsys.com/file/", + id_prefix=None, + index_data_url=index_url + ) + + def get_formats(self, exts, mts, gnrs): + fmts = [] + idx = 0 + with open(self.source_file, "r") as f: + for line in f.readlines(): + idx += 1 + entry = json.loads(line) + if 'extensions' in entry or 'types' in entry: + # Do the ugly book-keeping to make the SQL work: + extensions = list() + for extension in set(entry.get('extensions', [])): + exts[extension] = exts.get(extension, Extension(id=extension)) + extensions.append(exts[extension]) + media_types = [] + for mt in set(entry.get('types', [])): + mts[mt] = mts.get(mt, MediaType(id=mt)) + media_types.append(mts[mt]) + # Set up as a format entity: + f = Format( + registry=self.registry, + id=f"{self.registry_id}:{idx}", + name=entry["name"], + version=None, + summary=None, + genres=[], + extensions=extensions, + media_types=media_types, + has_magic=True, + primary_media_type=None, + parent_media_type=None, + registry_url=None, + registry_source_data_url=self.source_url, + registry_index_data_url=f"{self.index_url}#L{idx}", + created=None, + last_modified=None + ) + # And record the entry: + fmts.append(f) + + # Now yield them, so all the log entries get stored too: + for f in fmts: + yield f \ No newline at end of file diff --git a/foreging/mediainfo.py b/foreging/mediainfo.py new file mode 100644 index 00000000..f63b5a14 --- /dev/null +++ b/foreging/mediainfo.py @@ -0,0 +1,64 @@ +import json +import yaml +from lxml import etree +from io import BytesIO +import logging +from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# +class MediaInfo(): + source_file = "digipres.github.io/_sources/registries/mediainfo/mediainfo.jsonl" + source_url = "https://mediaarea.net/en/MediaInfo/Support/Formats" + index_url = "https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/mediainfo/mediainfo.jsonl" + # Set up the Registry object for this class: + registry_id = "mediainfo" + registry = Registry( + id=registry_id, + name="MediaInfo (WIP)", + url="https://mediaarea.net/en/MediaInfo/Support/Formats", + id_prefix=None, + index_data_url=index_url + ) + + def get_formats(self, exts, mts, gnrs): + fmts = [] + idx = 0 + with open(self.source_file, "r") as f: + for line in f.readlines(): + idx += 1 + entry = json.loads(line) + if 'extensions' in entry or 'types' in entry: + # Do the ugly book-keeping to make the SQL work: + extensions = list() + for extension in set(entry.get('extensions', [])): + exts[extension] = exts.get(extension, Extension(id=extension)) + extensions.append(exts[extension]) + # Set up as a format entity: + f = Format( + registry=self.registry, + id=f"{self.registry_id}:{idx}", + name=entry["name"], + version=None, + summary=None, + genres=[], + extensions=extensions, + media_types=[], + has_magic=True, + primary_media_type=None, + parent_media_type=None, + registry_url=None, + registry_source_data_url=self.source_url, + registry_index_data_url=f"{self.index_url}#L{idx}", + created=None, + last_modified=None + ) + # And record the entry: + fmts.append(f) + + # Now yield them, so all the log entries get stored too: + for f in fmts: + yield f \ No newline at end of file diff --git a/foreging/populate.py b/foreging/populate.py index 3b319bd7..b32cd6ea 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -1,6 +1,8 @@ +from .file import File from .ffw import FFW from .linguist import Linguist from .loc_fdd import LocFDD +from .mediainfo import MediaInfo from .nara import NARA_FFPP from .pronom import PRONOM from .tcdb import TCDB @@ -24,7 +26,7 @@ def populate_database(session, gen, exts, mts, genres): if __name__ == "__main__": # Registries registries = {} - for r in [FFW(), Linguist(), LocFDD(), NARA_FFPP(), PRONOM(), TCDB(), Tika(), TrID(), WikiData()]: + for r in [File(), FFW(), Linguist(), LocFDD(), MediaInfo(), NARA_FFPP(), PRONOM(), TCDB(), Tika(), TrID(), WikiData()]: registries[r.registry.id] = r # Args diff --git a/pyproject.toml b/pyproject.toml index 29820259..882e0d25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ dependencies = [ "beautifulsoup4", "lxml", "rdflib", + "polyfile", "pydantic", "sqlite-utils", "sqlmodel" From 09afd299392925e130cf94edd070f06daf582f21 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Tue, 12 Aug 2025 09:30:43 +0100 Subject: [PATCH 40/53] Moving species work out of here. --- foreging/species.py | 125 -------------------------------------------- 1 file changed, 125 deletions(-) delete mode 100644 foreging/species.py diff --git a/foreging/species.py b/foreging/species.py deleted file mode 100644 index 2baf7c7b..00000000 --- a/foreging/species.py +++ /dev/null @@ -1,125 +0,0 @@ -# Use the idea of a Species Accumulation Curve to understand the scale of the format challenge. -import csv -import yaml -import json -import logging -import argparse -from collections import defaultdict - -logging.basicConfig(level=logging.WARNING, format='%(asctime)s: %(levelname)s - %(name)s - %(message)s') - -logger = logging.getLogger(__name__) - - -def load_extensions(): - with open('digipres.github.io/_data/formats/extensions.yml') as f: - extensions = yaml.safe_load(f) - return extensions - -def reindex_by_registry(extensions): - exts = extensions['extensions'] - ext_sets = defaultdict(set) - for ext in exts: - for id in exts[ext]['identifiers']: - ext_sets[id['regId']].add(ext.lower()) - return ext_sets - -def compute_sac(): - ext_sets = reindex_by_registry(load_extensions()) - - all_extensions = set() - sample_total = 0 - - # Go though the dict of sets, sorting them so largest sets go first (note each item is the k,v array): - # Doing this seems to make the curve fitting more robust/consistent. - print("source,num_exts,num_uniq_exts,percent_uniq_exts,total_exts,total_uniq_exts,added_uniq_exts") - for set_key, ext_set in sorted(ext_sets.items(), key=lambda item: len(item[1]), reverse=True): - sample_total += len(ext_set) - current_total = len(all_extensions) - all_extensions |= ext_set - total_added = len(all_extensions) - current_total - # Calculate the unique part, by making a copy of the set and removing all other sets from it: - unique_ext = ext_set.copy() - for other_set in ext_sets: - if other_set != set_key: - unique_ext -= ext_sets[other_set] - # Share & Enjoy: - set_size = len(ext_set) - unique_size = len(unique_ext) - print(f"{set_key},{set_size},{unique_size},{100.0*unique_size/set_size:.3f},{sample_total},{len(all_extensions)},{total_added}") - - -def _print_comparison(set_key, candidate_set, collection_set, collection_counts, collection_total): - remainder = collection_set - candidate_set - common = collection_set.intersection(candidate_set) - remainder_count = 0 - for ext in remainder: - remainder_count += collection_counts[ext] - print(f"{set_key} {len(common)} {len(remainder)} {remainder_count} {collection_total}")# {json.dumps(list(remainder))}") - -def compare_csv(csv_file): - collection_set = set() - collection_counts = {} - collection_total = 0 - with open(csv_file) as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - ext = row['extension'].lower().strip() - # Drop extensions with spaces in: - if " " in ext: - logger.warning(f"Dropping extension with space in: '{ext}'") - continue - # Drop extensions that are just numbers: - if ext.isnumeric(): - logger.warning(f"Dropping extension that appears to be just a number: '{ext}'") - continue - # Convert to standard lower-case glob format - ext = f"*.{ext}" - logger.debug(f"Found extension {ext} with file_count {row['file_count']}") - collection_set.add(ext) - collection_counts[ext] = int(row['file_count']) - collection_total += int(row['file_count']) - - ext_sets = reindex_by_registry(load_extensions()) - all_extensions = set() - for set_key, ext_set in sorted(ext_sets.items(), key=lambda item: len(item[1]), reverse=True): - all_extensions |= ext_set - _print_comparison(set_key, ext_set, collection_set, collection_counts, collection_total) - _print_comparison("_ALL_", all_extensions, collection_set, collection_counts, collection_total) - -def write_extensions(output_json): - ext_sets = reindex_by_registry(load_extensions()) - with open(output_json,"w") as f: - json.dump(ext_sets, f, default=list) - - -if __name__ == "__main__": - common_args = argparse.ArgumentParser(prog="species", add_help=False) - common_args.add_argument('-v', '--verbose', action='count', default=0, help='Logging level; add more -v for more logging.') - - parser = argparse.ArgumentParser(prog="species", add_help=True) - subparsers = parser.add_subparsers(dest="action", help='action') - - parser_sac = subparsers.add_parser('curve', parents=[common_args], help="Load the extensions and compute the Species Accumulation Curve.") - - parser_cmp = subparsers.add_parser('compare', parents=[common_args], help="Compare extensions from a CSV file with the registry contents.") - parser_cmp.add_argument('csv_file', type=str, help='CSV file to load') - - parser_exts = subparsers.add_parser('extensions', parents=[common_args], help="Write the extensions data out as a JSON file.") - parser_exts.add_argument('json_file', type=str, help='JSON file to write') - - args = parser.parse_args() - - # Set up verbose logging: - if 'verbose' in args: - if args.verbose == 1: - logging.getLogger().setLevel(logging.INFO) - elif args.verbose >= 2: - logging.getLogger().setLevel(logging.DEBUG) - - if args.action == 'curve': - compute_sac() - elif args.action == 'compare': - compare_csv(args.csv_file) - elif args.action == "extensions": - write_extensions(args.json_file) \ No newline at end of file From 30809a57b1903b219ad231eefefbdc56c5e07257 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Tue, 12 Aug 2025 09:30:58 +0100 Subject: [PATCH 41/53] Updating sources. --- digipres.github.io | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/digipres.github.io b/digipres.github.io index c4ae3013..768a4ccf 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit c4ae3013b3af82c3d668552f03c83db4c9d6c9b6 +Subproject commit 768a4ccfce0277fbde55d57735988c2dc58c13fe From 6ab7c0c73c8c7f88e35c1c803b7e27f076b9c9ae Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Tue, 12 Aug 2025 16:05:48 +0100 Subject: [PATCH 42/53] Use a plain list for extensions. --- Makefile | 3 +-- foreging/db/__init__.py | 0 foreging/db/extension_sets.py | 15 +++++++++++++++ foreging/ffw.py | 9 ++++----- foreging/file.py | 7 +++---- foreging/linguist.py | 7 +++---- foreging/loc_fdd.py | 7 +++---- foreging/mediainfo.py | 7 +++---- foreging/models.py | 20 ++------------------ foreging/nara.py | 7 +++---- foreging/populate.py | 7 +++---- foreging/pronom.py | 7 +++---- foreging/tcdb.py | 7 +++---- foreging/tika.py | 7 +++---- foreging/trid.py | 11 +++++------ foreging/wikidata.py | 7 +++---- 16 files changed, 57 insertions(+), 71 deletions(-) create mode 100644 foreging/db/__init__.py create mode 100644 foreging/db/extension_sets.py diff --git a/Makefile b/Makefile index cfd0b3bb..d60d52dc 100644 --- a/Makefile +++ b/Makefile @@ -5,9 +5,8 @@ registries.db: foreging/*.py rm -f $@ $@.tmp mkdir -p data python -m foreging.populate $@.tmp - sqlite-utils enable-fts $@.tmp format name version summary + sqlite-utils enable-fts $@.tmp format name version summary extensions sqlite-utils enable-fts $@.tmp media_type id - sqlite-utils enable-fts $@.tmp extension id sqlite-utils enable-fts $@.tmp genre name sqlite-utils enable-fts $@.tmp software name version summary sqlite-utils enable-fts $@.tmp registry_data_log level message diff --git a/foreging/db/__init__.py b/foreging/db/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/foreging/db/extension_sets.py b/foreging/db/extension_sets.py new file mode 100644 index 00000000..1338184a --- /dev/null +++ b/foreging/db/extension_sets.py @@ -0,0 +1,15 @@ +import json +import sqlite3 +from collections import defaultdict +con = sqlite3.connect("registries.db") + +cur = con.cursor() + +ext_sets = defaultdict(set) +for row in cur.execute("SELECT registry_id, format.id, e.value FROM format, json_each(extensions) AS e ORDER BY e.value ASC"): + ext_sets[row[0]].add(row[2]) + +for source, ext_set in ext_sets.items(): + ext_sets[source] = list(ext_set) + +print(json.dumps(ext_sets)) \ No newline at end of file diff --git a/foreging/ffw.py b/foreging/ffw.py index 71e27434..cab39b6c 100644 --- a/foreging/ffw.py +++ b/foreging/ffw.py @@ -1,7 +1,7 @@ import json import yaml import logging -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -20,7 +20,7 @@ class FFW(): index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{source_file}" ) - def get_formats(self, exts, mts, gnrs): + def get_formats(self, mts, gnrs): stream = open(self.source_file, 'r') ffw = yaml.safe_load(stream) stream.close() @@ -36,9 +36,8 @@ def get_formats(self, exts, mts, gnrs): if key == 'extensions': for ext in fmt[key]: if ext: - ext=ext.lower() - exts[ext] = exts.get(ext, Extension(id=ext)) - f_info['extensions'].add(exts[ext]) + ext=ext.lower().strip() + f_info['extensions'].add(ext) elif key == 'mimetypes': for mt in fmt[key]: mts[mt] = mts.get(mt, MediaType(id=mt)) diff --git a/foreging/file.py b/foreging/file.py index e92c2f51..0d88aaf2 100644 --- a/foreging/file.py +++ b/foreging/file.py @@ -3,7 +3,7 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -24,7 +24,7 @@ class File(): index_data_url=index_url ) - def get_formats(self, exts, mts, gnrs): + def get_formats(self, mts, gnrs): fmts = [] idx = 0 with open(self.source_file, "r") as f: @@ -35,8 +35,7 @@ def get_formats(self, exts, mts, gnrs): # Do the ugly book-keeping to make the SQL work: extensions = list() for extension in set(entry.get('extensions', [])): - exts[extension] = exts.get(extension, Extension(id=extension)) - extensions.append(exts[extension]) + extensions.append(extension) media_types = [] for mt in set(entry.get('types', [])): mts[mt] = mts.get(mt, MediaType(id=mt)) diff --git a/foreging/linguist.py b/foreging/linguist.py index 8ece7f75..9319f512 100644 --- a/foreging/linguist.py +++ b/foreging/linguist.py @@ -1,7 +1,7 @@ import json import yaml import logging -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -20,7 +20,7 @@ class Linguist(): index_data_url=None ) - def get_formats(self, exts, mts, gnrs): + def get_formats(self, mts, gnrs): stream = open(self.source_file, 'r') ghl = yaml.safe_load(stream) stream.close() @@ -38,8 +38,7 @@ def get_formats(self, exts, mts, gnrs): for ext in fmt[key]: if ext: ext=ext.strip('.') # Drop the prefix dot - exts[ext] = exts.get(ext, Extension(id=ext)) - f_info['extensions'].add(exts[ext]) + f_info['extensions'].add(ext) elif key == 'codemirror_mime_type': mt = fmt[key] mts[mt] = mts.get(mt, MediaType(id=mt)) diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py index 25beecb3..27481659 100644 --- a/foreging/loc_fdd.py +++ b/foreging/loc_fdd.py @@ -2,7 +2,7 @@ import logging import datetime from bs4 import BeautifulSoup -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -18,7 +18,7 @@ class LocFDD(): id_prefix='https://www.loc.gov/preservation/digital/formats/fdd/' ) - def get_formats(self, exts, mts, genres): + def get_formats(self, mts, genres): fmts = {} @@ -66,8 +66,7 @@ def get_formats(self, exts, mts, genres): for fe in root.findAll('filenameExtension'): for fev in fe.findAll('sigValue'): ext = f"{fev.text}" - exts[ext] = exts.get(ext, Extension(id=ext)) - f_extensions.add(exts[ext]) + f_extensions.add(ext) # Get MIME types: f_mimetypes = set() for imts in root.findAll('internetMediaType'): diff --git a/foreging/mediainfo.py b/foreging/mediainfo.py index f63b5a14..4caa8028 100644 --- a/foreging/mediainfo.py +++ b/foreging/mediainfo.py @@ -3,7 +3,7 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -24,7 +24,7 @@ class MediaInfo(): index_data_url=index_url ) - def get_formats(self, exts, mts, gnrs): + def get_formats(self, mts, gnrs): fmts = [] idx = 0 with open(self.source_file, "r") as f: @@ -35,8 +35,7 @@ def get_formats(self, exts, mts, gnrs): # Do the ugly book-keeping to make the SQL work: extensions = list() for extension in set(entry.get('extensions', [])): - exts[extension] = exts.get(extension, Extension(id=extension)) - extensions.append(exts[extension]) + extensions.append(extension) # Set up as a format entity: f = Format( registry=self.registry, diff --git a/foreging/models.py b/foreging/models.py index c12fe183..0aa8a099 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -1,5 +1,5 @@ from datetime import date -from sqlmodel import Field, Relationship, Session, SQLModel, create_engine +from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, JSON, Column class Registry(SQLModel, table=True): @@ -75,22 +75,6 @@ def __hash__(self): def __eq__(self,other): return self.name == other.name -class ExtensionFormatsLink(SQLModel, table=True): - __tablename__ = "format_extensions" - format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) - extension_id: str | None = Field(default=None, foreign_key="extension.id", primary_key=True) - -class Extension(SQLModel, table=True): - id: str | None = Field(default=None, primary_key=True) - # - formats: list["Format"] = Relationship(back_populates="extensions", link_model=ExtensionFormatsLink) - - # Define how to spot unique entries in a set - def __hash__(self): - return hash(self.id) - def __eq__(self,other): - return self.id == other.id - class MediaTypesFormatsLink(SQLModel, table=True): __tablename__ = "format_media_types" format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) @@ -114,7 +98,7 @@ class Format(SQLModel, table=True): version: str | None = Field(index=True) summary: str | None = Field(index=True) genres: list["Genre"] = Relationship(back_populates="formats", link_model=FormatGenresLink) - extensions: list["Extension"] = Relationship(back_populates="formats", link_model=ExtensionFormatsLink) + extensions: list[str] | None = Field(default=None, sa_column=Column(JSON)) media_types: list["MediaType"] = Relationship(back_populates="formats", link_model=MediaTypesFormatsLink) has_magic: bool = Field(default=False) primary_media_type: str | None = Field(index=True) diff --git a/foreging/nara.py b/foreging/nara.py index 3a465594..c1c04cd3 100644 --- a/foreging/nara.py +++ b/foreging/nara.py @@ -3,7 +3,7 @@ from rdflib import Graph, RDF, DCTERMS from rdflib.namespace import DefinedNamespace, Namespace from rdflib.term import URIRef -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logger = logging.getLogger(__name__) @@ -45,7 +45,7 @@ class NARA_FFPP(): id_prefix='https://www.archives.gov/files/lod/dpframework/id/' ) - def get_formats(self, exts, mts, grs): + def get_formats(self, mts, grs): g = Graph() g.parse(self.source_file) @@ -63,8 +63,7 @@ def get_formats(self, exts, mts, grs): extensions = set() for ext in [o for s, p, o in g.triples((s, WDT.p1195, None))]: ext = str(ext) - exts[ext] = exts.get(ext, Extension(id=ext)) - extensions.add(exts[ext]) + extensions.add(ext) genres = [] for genre in [o for s, p, o in g.triples((s, NARA.category, None))]: genre = str(genre) diff --git a/foreging/populate.py b/foreging/populate.py index b32cd6ea..f4baaaf0 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -18,9 +18,9 @@ logger = logging.getLogger(__name__) # Push in the data: -def populate_database(session, gen, exts, mts, genres): +def populate_database(session, gen, mts, genres): logger.info("Getting transformed format records for registry ID %s..." % gen.registry_id) - for f in gen.get_formats(exts, mts, genres): + for f in gen.get_formats(mts, genres): session.add(f) if __name__ == "__main__": @@ -37,7 +37,6 @@ def populate_database(session, gen, exts, mts, genres): # Cache the cross-referenced entities: - exts = {} mts = {} genres = {} @@ -53,7 +52,7 @@ def populate_database(session, gen, exts, mts, genres): for reg_id in registries: reg = registries[reg_id] if args.only == None or args.only == reg_id: - populate_database(session, reg, exts, mts, genres) + populate_database(session, reg, mts, genres) # Every commit should be self-consistent at this point: session.commit() diff --git a/foreging/pronom.py b/foreging/pronom.py index f9f44ade..5bfcc7a9 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -2,7 +2,7 @@ import logging import datetime from bs4 import BeautifulSoup -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -25,7 +25,7 @@ def _date_parser(self, pronom_date): date = datetime.datetime.strptime(pronom_date, "%d %b %Y") return date - def get_formats(self, exts, mts, genres): + def get_formats(self, mts, genres): for source_folder_name in ['fmt', 'x-fmt']: source_folder = os.path.join(self.source_folder, source_folder_name) @@ -62,8 +62,7 @@ def get_formats(self, exts, mts, genres): for fe in root.findAll('ExternalSignature'): if fe.find('SignatureType', string='File extension'): ext = fe.find('Signature').text - exts[ext] = exts.get(ext, Extension(id=ext)) - extensions.append(exts[ext]) + extensions.append(ext) f_extensions = extensions # Get MIME types: mimetypes = list() diff --git a/foreging/tcdb.py b/foreging/tcdb.py index a1e1f8cf..4f76b0e1 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -1,7 +1,7 @@ import os import csv import logging -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -22,7 +22,7 @@ class TCDB(): index_data_url=source_file ) - def get_formats(self, exts, mts, genres): + def get_formats(self, mts, genres): # First, gather rows by type_code... rows_by_type_code = {} # Open, coping with Unicode BOM @@ -50,8 +50,7 @@ def get_formats(self, exts, mts, genres): # ext = row['Extension'].strip().lower() if ext: - exts[ext] = exts.get(ext,Extension(id=ext)) - extensions.add(exts[ext]) + extensions.add(ext) # cat = row['Category'].strip() if cat: diff --git a/foreging/tika.py b/foreging/tika.py index 07664035..c684109e 100644 --- a/foreging/tika.py +++ b/foreging/tika.py @@ -3,7 +3,7 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -24,7 +24,7 @@ class Tika(): index_data_url=index_url ) - def get_formats(self, exts, mts, gnrs): + def get_formats(self, mts, gnrs): fmts = [] with open(self.source_file, "rb") as f: xml = f.read() @@ -58,8 +58,7 @@ def get_formats(self, exts, mts, gnrs): extensions = list() for ext in ff.findall('glob'): extension = ext.get('pattern').replace('*.','') # Strip the glob - exts[extension] = exts.get(extension, Extension(id=extension)) - extensions.append(exts[extension]) + extensions.append(extension) finfo['extensions'] = extensions # Look for MIME Types: mimetypes = list() diff --git a/foreging/trid.py b/foreging/trid.py index 5539b600..badf4e6c 100644 --- a/foreging/trid.py +++ b/foreging/trid.py @@ -4,7 +4,7 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -27,15 +27,14 @@ class TrID(): fmts = [] - def add_format(self, fid, finfo, exts, mts, gnrs): + def add_format(self, fid, finfo, mts, gnrs): media_types = [] for mt in finfo['mimetypes']: mts[mt] = mts.get(mt, MediaType(id=mt)) media_types.append(mts[mt]) extensions = [] for ext in finfo['extensions']: - exts[ext] = exts.get(ext, Extension(id=ext)) - extensions.append(exts[ext]) + extensions.append(ext) # Set up as a format entity: f = Format( registry=self.registry, @@ -60,7 +59,7 @@ def add_format(self, fid, finfo, exts, mts, gnrs): - def get_formats(self, exts, mts, gnrs): + def get_formats(self, mts, gnrs): for filename in os.listdir(f'{self.source_dir}/triddefs_xml'): if filename.endswith(".trid.xml"): # Get Identifier? @@ -84,7 +83,7 @@ def get_formats(self, exts, mts, gnrs): finfo['extensions'] = extensions # Get MIME types: finfo['mimetypes'] = list() - self.add_format(fid, finfo, exts, mts, gnrs) + self.add_format(fid, finfo, mts, gnrs) # Now yield them, so all the log entries get stored too: for f in self.fmts: diff --git a/foreging/wikidata.py b/foreging/wikidata.py index bef0238c..dc6daab8 100644 --- a/foreging/wikidata.py +++ b/foreging/wikidata.py @@ -1,6 +1,6 @@ import json import logging -from .models import Format, Software, Registry, Extension, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -26,7 +26,7 @@ class WikiData(): ) - def get_formats(self, exts, mts, genres): + def get_formats(self, mts, genres): with open (self.fmt_source_file, 'r') as f: wd = json.load(f) @@ -58,8 +58,7 @@ def get_formats(self, exts, mts, genres): if key == 'extension' and fmt[key]: # Making sure we reuse the same object for an extension to keep the model consistent: ext = fmt[key] - exts[ext] = exts.get(ext, Extension(id=ext)) - finfo['extensions'].add(exts[ext]) + finfo['extensions'].add(ext) if key == 'mimetype' and fmt[key]: mt = fmt[key] mts[mt] = mts.get(mt, MediaType(id=mt)) From f4ce8938a8be12510273a874862303e594f2c3d3 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 14 Aug 2025 12:56:37 +0100 Subject: [PATCH 43/53] clean up extension sets export. --- foreging/db/extension_sets.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/foreging/db/extension_sets.py b/foreging/db/extension_sets.py index 1338184a..d954d3d8 100644 --- a/foreging/db/extension_sets.py +++ b/foreging/db/extension_sets.py @@ -1,15 +1,23 @@ import json import sqlite3 +import logging from collections import defaultdict + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + con = sqlite3.connect("registries.db") cur = con.cursor() ext_sets = defaultdict(set) +ext_counts = defaultdict(int) for row in cur.execute("SELECT registry_id, format.id, e.value FROM format, json_each(extensions) AS e ORDER BY e.value ASC"): - ext_sets[row[0]].add(row[2]) + ext_sets[row[0]].add(row[2].lower().strip()) + ext_counts[row[0]] += 1 for source, ext_set in ext_sets.items(): ext_sets[source] = list(ext_set) + logger.info(f"Registry {source} has {ext_counts[source]} extensions, of which {len(ext_set)} are unique. Ratio: {ext_counts[source]/len(ext_set)}") print(json.dumps(ext_sets)) \ No newline at end of file From 8d37eaac30757054aa658b6034e26524dbd9fab6 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Thu, 14 Aug 2025 13:08:26 +0100 Subject: [PATCH 44/53] Added CLI and args. --- foreging/db/extension_sets.py | 42 ++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/foreging/db/extension_sets.py b/foreging/db/extension_sets.py index d954d3d8..033e2368 100644 --- a/foreging/db/extension_sets.py +++ b/foreging/db/extension_sets.py @@ -1,23 +1,43 @@ import json import sqlite3 import logging +import argparse from collections import defaultdict logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -con = sqlite3.connect("registries.db") -cur = con.cursor() +def generate_ext_sets(db): + con = sqlite3.connect(db) + + cur = con.cursor() + + ext_sets = defaultdict(set) + ext_counts = defaultdict(int) + for row in cur.execute("SELECT registry_id, format.id, e.value FROM format, json_each(extensions) AS e ORDER BY e.value ASC"): + ext_sets[row[0]].add(row[2].lower().strip()) + ext_counts[row[0]] += 1 + + for source, ext_set in ext_sets.items(): + ext_sets[source] = list(ext_set) + logger.info(f"Registry {source} has {ext_counts[source]} extensions, of which {len(ext_set)} are unique. Ratio: {ext_counts[source]/len(ext_set)}") + return ext_sets, ext_counts + + +if __name__ == "__main__": + # Args setup: + parser = argparse.ArgumentParser() + parser.add_argument('input_db') + parser.add_argument('output_json') + args = parser.parse_args() + + # Query and return the sets of extensions: + ext_sets, ext_counts = generate_ext_sets(args.input_db) + + # Output the sets of extensions: + with open(args.output_json, 'w') as f: + json.dump(ext_sets, f) -ext_sets = defaultdict(set) -ext_counts = defaultdict(int) -for row in cur.execute("SELECT registry_id, format.id, e.value FROM format, json_each(extensions) AS e ORDER BY e.value ASC"): - ext_sets[row[0]].add(row[2].lower().strip()) - ext_counts[row[0]] += 1 -for source, ext_set in ext_sets.items(): - ext_sets[source] = list(ext_set) - logger.info(f"Registry {source} has {ext_counts[source]} extensions, of which {len(ext_set)} are unique. Ratio: {ext_counts[source]/len(ext_set)}") -print(json.dumps(ext_sets)) \ No newline at end of file From 61249ba3751ce13c0a59b02436e2c78c523697e4 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 26 Sep 2025 12:20:43 +0100 Subject: [PATCH 45/53] Simplified to generate JSON first, use a simpler data model. --- digipres.github.io | 2 +- foreging/db/models.py | 117 ++++++++++++++++++++++++++++++++++ foreging/ffw.py | 14 ++-- foreging/file.py | 17 ++--- foreging/linguist.py | 10 ++- foreging/loc_fdd.py | 12 ++-- foreging/mediainfo.py | 6 +- foreging/models.py | 145 ++++++++++++------------------------------ foreging/nara.py | 26 ++++---- foreging/populate.py | 57 ++++++++++------- foreging/pronom.py | 11 ++-- foreging/tcdb.py | 29 ++++----- foreging/tika.py | 13 ++-- foreging/trid.py | 13 ++-- foreging/wikidata.py | 32 +++++----- pyproject.toml | 1 + 16 files changed, 273 insertions(+), 232 deletions(-) create mode 100644 foreging/db/models.py diff --git a/digipres.github.io b/digipres.github.io index 768a4ccf..b44cf47f 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit 768a4ccfce0277fbde55d57735988c2dc58c13fe +Subproject commit b44cf47f6bb259f202b4b97775b60ab5957ae412 diff --git a/foreging/db/models.py b/foreging/db/models.py new file mode 100644 index 00000000..0aa8a099 --- /dev/null +++ b/foreging/db/models.py @@ -0,0 +1,117 @@ +from datetime import date +from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, JSON, Column + + +class Registry(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + name: str = Field(index=True) + url: str | None = Field() + id_prefix: str | None = Field() + index_data_url: str | None = Field() + + data_log: list["RegistryDataLogEntry"] = Relationship() + + +class RegistryDataLogEntry(SQLModel, table=True): + __tablename__ = 'registry_data_log' + id: int | None = Field(default=None, primary_key=True) + level: str = Field(index=True) + message: str = Field() + url: str | None = Field() + + registry_id: str | None = Field(default=None, foreign_key="registry.id") + registry: Registry | None = Relationship(back_populates="data_log") + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.message) + def __eq__(self,other): + return self.message == other.message + +class SoftwareReadsFormatLink(SQLModel, table=True): + __tablename__ = "formats_read_by_software" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + software_id: str | None = Field(default=None, foreign_key="software.id", primary_key=True) + +class SoftwareWritesFormatLink(SQLModel, table=True): + __tablename__ = "formats_written_by_software" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + software_id: str | None = Field(default=None, foreign_key="software.id", primary_key=True) + +class Software(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + name: str = Field(index=True) + version: str | None = Field(index=True) + summary: str | None = Field(index=True) + license: str | None = Field(index=True) + registry_url: str | None = Field(index=True) + + reads: list["Format"] = Relationship(back_populates="readers", link_model=SoftwareReadsFormatLink) + writes: list["Format"] = Relationship(back_populates="writers", link_model=SoftwareWritesFormatLink) + + registry_id: str | None = Field(default=None, foreign_key="registry.id") + registry: Registry | None = Relationship() + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.id) + def __eq__(self,other): + return self.id == other.id + +class FormatGenresLink(SQLModel, table=True): + __tablename__ = "format_genres" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + genre_id: str | None = Field(default=None, foreign_key="genre.id", primary_key=True) + +class Genre(SQLModel, table=True): + id: int | None = Field(default=None, primary_key=True) + name: str = Field(index=True) + # + formats: list["Format"] = Relationship(back_populates="genres", link_model=FormatGenresLink) + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.name) + def __eq__(self,other): + return self.name == other.name + +class MediaTypesFormatsLink(SQLModel, table=True): + __tablename__ = "format_media_types" + format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) + media_type_id: str | None = Field(default=None, foreign_key="media_type.id", primary_key=True) + +class MediaType(SQLModel, table=True): + __tablename__ = "media_type" + id: str | None = Field(default=None, primary_key=True) + # + formats: list["Format"] = Relationship(back_populates="media_types", link_model=MediaTypesFormatsLink) + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.id) + def __eq__(self,other): + return self.id == other.id + +class Format(SQLModel, table=True): + id: str | None = Field(default=None, primary_key=True) + name: str | None = Field(index=True) + version: str | None = Field(index=True) + summary: str | None = Field(index=True) + genres: list["Genre"] = Relationship(back_populates="formats", link_model=FormatGenresLink) + extensions: list[str] | None = Field(default=None, sa_column=Column(JSON)) + media_types: list["MediaType"] = Relationship(back_populates="formats", link_model=MediaTypesFormatsLink) + has_magic: bool = Field(default=False) + primary_media_type: str | None = Field(index=True) + parent_media_type: str | None = Field(index=True) + registry_url: str | None = Field(index=True) + registry_source_data_url: str | None = Field(index=True) + registry_index_data_url: str | None = Field(index=True) + created: date | None = Field(index=True) + last_modified: date | None = Field(index=True) + + readers: list["Software"] = Relationship(back_populates="reads", link_model=SoftwareReadsFormatLink) + writers: list["Software"] = Relationship(back_populates="writes", link_model=SoftwareWritesFormatLink) + + registry_id: str | None = Field(default=None, foreign_key="registry.id") + registry: Registry | None = Relationship() + diff --git a/foreging/ffw.py b/foreging/ffw.py index cab39b6c..e5a2c033 100644 --- a/foreging/ffw.py +++ b/foreging/ffw.py @@ -1,7 +1,7 @@ import json import yaml import logging -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -20,7 +20,7 @@ class FFW(): index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/{source_file}" ) - def get_formats(self, mts, gnrs): + def get_formats(self): stream = open(self.source_file, 'r') ffw = yaml.safe_load(stream) stream.close() @@ -39,19 +39,15 @@ def get_formats(self, mts, gnrs): ext=ext.lower().strip() f_info['extensions'].add(ext) elif key == 'mimetypes': - for mt in fmt[key]: - mts[mt] = mts.get(mt, MediaType(id=mt)) - f_info['mimetypes'].add(mts[mt]) + f_info['mimetypes'] = fmt[key] elif key == 'categories': - for cat in fmt[key]: - gnrs[cat] = gnrs.get(cat, Genre(name=cat)) - f_info['categories'].add(gnrs[cat]) + f_info['categories'] = fmt[key] else: f_info[key] = fmt[key] # Set up as a format entity: f = Format( - registry=self.registry, + registry_id=self.registry_id, id=ff_id, name=f_info['name'], version=None, diff --git a/foreging/file.py b/foreging/file.py index 0d88aaf2..c73a6118 100644 --- a/foreging/file.py +++ b/foreging/file.py @@ -3,7 +3,7 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -24,7 +24,7 @@ class File(): index_data_url=index_url ) - def get_formats(self, mts, gnrs): + def get_formats(self): fmts = [] idx = 0 with open(self.source_file, "r") as f: @@ -32,17 +32,12 @@ def get_formats(self, mts, gnrs): idx += 1 entry = json.loads(line) if 'extensions' in entry or 'types' in entry: - # Do the ugly book-keeping to make the SQL work: - extensions = list() - for extension in set(entry.get('extensions', [])): - extensions.append(extension) - media_types = [] - for mt in set(entry.get('types', [])): - mts[mt] = mts.get(mt, MediaType(id=mt)) - media_types.append(mts[mt]) + # Remove duplicate entries: + extensions = list(set(entry.get('extensions', []))) + media_types = list(set(entry.get('types', []))) # Set up as a format entity: f = Format( - registry=self.registry, + registry_id=self.registry_id, id=f"{self.registry_id}:{idx}", name=entry["name"], version=None, diff --git a/foreging/linguist.py b/foreging/linguist.py index 9319f512..b186dc4a 100644 --- a/foreging/linguist.py +++ b/foreging/linguist.py @@ -1,7 +1,7 @@ import json import yaml import logging -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -20,7 +20,7 @@ class Linguist(): index_data_url=None ) - def get_formats(self, mts, gnrs): + def get_formats(self): stream = open(self.source_file, 'r') ghl = yaml.safe_load(stream) stream.close() @@ -40,15 +40,13 @@ def get_formats(self, mts, gnrs): ext=ext.strip('.') # Drop the prefix dot f_info['extensions'].add(ext) elif key == 'codemirror_mime_type': - mt = fmt[key] - mts[mt] = mts.get(mt, MediaType(id=mt)) - f_info['mimetypes'].add(mts[mt]) + f_info['mimetypes'] = fmt[key] else: f_info[key] = fmt[key] # Set up as a format entity: f = Format( - registry=self.registry, + registry_id=self.registry_id, id=ff_id, name=f_info['name'], version=None, diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py index 27481659..9acb19b1 100644 --- a/foreging/loc_fdd.py +++ b/foreging/loc_fdd.py @@ -2,7 +2,7 @@ import logging import datetime from bs4 import BeautifulSoup -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -18,7 +18,7 @@ class LocFDD(): id_prefix='https://www.loc.gov/preservation/digital/formats/fdd/' ) - def get_formats(self, mts, genres): + def get_formats(self): fmts = {} @@ -55,7 +55,7 @@ def get_formats(self, mts, genres): f_genres = list() for gns in root.findAll('gdfrGenreSelection'): for gn in gns.findAll('gdfrGenre'): - f_genres.append(Genre(name=f"gdfr:{gn.text}")) + f_genres.append(f"gdfr:{gn.text}") # Haz Magic? if root.find('magicNumbers'): f_magic = True @@ -71,9 +71,7 @@ def get_formats(self, mts, genres): f_mimetypes = set() for imts in root.findAll('internetMediaType'): for mt in imts.findAll('sigValue'): - mt = mt.text - mts[mt] = mts.get(mt, MediaType(id=mt)) - f_mimetypes.add(mts[mt]) + f_mimetypes.add(mt.text) # Find the date: edit_date = root.findAll('date')[-1].text try: @@ -87,7 +85,7 @@ def get_formats(self, mts, genres): # Create record: f = Format( - registry=self.registry, + registry_id=self.registry_id, id=f"{self.registry_id}:{ffd_id}", name=f_name, version=f_version, diff --git a/foreging/mediainfo.py b/foreging/mediainfo.py index 4caa8028..7f9c801c 100644 --- a/foreging/mediainfo.py +++ b/foreging/mediainfo.py @@ -3,7 +3,7 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -24,7 +24,7 @@ class MediaInfo(): index_data_url=index_url ) - def get_formats(self, mts, gnrs): + def get_formats(self): fmts = [] idx = 0 with open(self.source_file, "r") as f: @@ -38,7 +38,7 @@ def get_formats(self, mts, gnrs): extensions.append(extension) # Set up as a format entity: f = Format( - registry=self.registry, + registry_id=self.registry_id, id=f"{self.registry_id}:{idx}", name=entry["name"], version=None, diff --git a/foreging/models.py b/foreging/models.py index 0aa8a099..f5b11a23 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -1,117 +1,52 @@ -from datetime import date -from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, JSON, Column - +import json +from typing import List, Optional, Set, Dict, Tuple, Type, Union, Literal, Annotated +from pydantic import BaseModel, Field +from datetime import datetime, date + +# Data model of normalised form of a format record: +class Format(BaseModel): + id: str | None + name: str | None + version: str | None + summary: str | None + genres: list[str] | None + extensions: list[str] | None + media_types: list[str] + has_magic: bool = Field(default=False) + primary_media_type: str | None = Field(index=True) + parent_media_type: str | None = Field(index=True) + registry_url: str | None = Field(index=True) + registry_source_data_url: str | None = Field(index=True) + registry_index_data_url: str | None = Field(index=True) + created: date | None = Field(index=True) + last_modified: date | None = Field(index=True) -class Registry(SQLModel, table=True): - id: str | None = Field(default=None, primary_key=True) - name: str = Field(index=True) - url: str | None = Field() - id_prefix: str | None = Field() - index_data_url: str | None = Field() + readers: Optional[list[str]] = [] + writers: Optional[list[str]] = [] - data_log: list["RegistryDataLogEntry"] = Relationship() - + registry_id: str | None -class RegistryDataLogEntry(SQLModel, table=True): - __tablename__ = 'registry_data_log' - id: int | None = Field(default=None, primary_key=True) - level: str = Field(index=True) - message: str = Field() - url: str | None = Field() - registry_id: str | None = Field(default=None, foreign_key="registry.id") - registry: Registry | None = Relationship(back_populates="data_log") +# +# And for a Registry +# +class RegistryDataLogEntry(BaseModel): + level: str + message: str + url: Optional[str] = None # Define how to spot unique entries in a set def __hash__(self): return hash(self.message) def __eq__(self,other): return self.message == other.message - -class SoftwareReadsFormatLink(SQLModel, table=True): - __tablename__ = "formats_read_by_software" - format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) - software_id: str | None = Field(default=None, foreign_key="software.id", primary_key=True) - -class SoftwareWritesFormatLink(SQLModel, table=True): - __tablename__ = "formats_written_by_software" - format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) - software_id: str | None = Field(default=None, foreign_key="software.id", primary_key=True) - -class Software(SQLModel, table=True): - id: str | None = Field(default=None, primary_key=True) - name: str = Field(index=True) - version: str | None = Field(index=True) - summary: str | None = Field(index=True) - license: str | None = Field(index=True) - registry_url: str | None = Field(index=True) - - reads: list["Format"] = Relationship(back_populates="readers", link_model=SoftwareReadsFormatLink) - writes: list["Format"] = Relationship(back_populates="writers", link_model=SoftwareWritesFormatLink) - - registry_id: str | None = Field(default=None, foreign_key="registry.id") - registry: Registry | None = Relationship() - - # Define how to spot unique entries in a set - def __hash__(self): - return hash(self.id) - def __eq__(self,other): - return self.id == other.id - -class FormatGenresLink(SQLModel, table=True): - __tablename__ = "format_genres" - format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) - genre_id: str | None = Field(default=None, foreign_key="genre.id", primary_key=True) - -class Genre(SQLModel, table=True): - id: int | None = Field(default=None, primary_key=True) - name: str = Field(index=True) - # - formats: list["Format"] = Relationship(back_populates="genres", link_model=FormatGenresLink) - - # Define how to spot unique entries in a set - def __hash__(self): - return hash(self.name) - def __eq__(self,other): - return self.name == other.name - -class MediaTypesFormatsLink(SQLModel, table=True): - __tablename__ = "format_media_types" - format_id: str | None = Field(default=None, foreign_key="format.id", primary_key=True) - media_type_id: str | None = Field(default=None, foreign_key="media_type.id", primary_key=True) - -class MediaType(SQLModel, table=True): - __tablename__ = "media_type" - id: str | None = Field(default=None, primary_key=True) - # - formats: list["Format"] = Relationship(back_populates="media_types", link_model=MediaTypesFormatsLink) - - # Define how to spot unique entries in a set - def __hash__(self): - return hash(self.id) - def __eq__(self,other): - return self.id == other.id - -class Format(SQLModel, table=True): - id: str | None = Field(default=None, primary_key=True) - name: str | None = Field(index=True) - version: str | None = Field(index=True) - summary: str | None = Field(index=True) - genres: list["Genre"] = Relationship(back_populates="formats", link_model=FormatGenresLink) - extensions: list[str] | None = Field(default=None, sa_column=Column(JSON)) - media_types: list["MediaType"] = Relationship(back_populates="formats", link_model=MediaTypesFormatsLink) - has_magic: bool = Field(default=False) - primary_media_type: str | None = Field(index=True) - parent_media_type: str | None = Field(index=True) - registry_url: str | None = Field(index=True) - registry_source_data_url: str | None = Field(index=True) - registry_index_data_url: str | None = Field(index=True) - created: date | None = Field(index=True) - last_modified: date | None = Field(index=True) - - readers: list["Software"] = Relationship(back_populates="reads", link_model=SoftwareReadsFormatLink) - writers: list["Software"] = Relationship(back_populates="writes", link_model=SoftwareWritesFormatLink) - registry_id: str | None = Field(default=None, foreign_key="registry.id") - registry: Registry | None = Relationship() +class Registry(BaseModel): + id: str + name: str + url: str + id_prefix: Optional[str] = None + index_data_url: Optional[str] = None + # Log for any issues + data_log: list[RegistryDataLogEntry] = [] diff --git a/foreging/nara.py b/foreging/nara.py index c1c04cd3..8def0ae1 100644 --- a/foreging/nara.py +++ b/foreging/nara.py @@ -3,7 +3,7 @@ from rdflib import Graph, RDF, DCTERMS from rdflib.namespace import DefinedNamespace, Namespace from rdflib.term import URIRef -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry logger = logging.getLogger(__name__) @@ -45,7 +45,7 @@ class NARA_FFPP(): id_prefix='https://www.archives.gov/files/lod/dpframework/id/' ) - def get_formats(self, mts, grs): + def get_formats(self): g = Graph() g.parse(self.source_file) @@ -66,27 +66,23 @@ def get_formats(self, mts, grs): extensions.add(ext) genres = [] for genre in [o for s, p, o in g.triples((s, NARA.category, None))]: - genre = str(genre) - grs[genre] = grs.get(genre, Genre(name=genre)) - genres.append(grs[genre]) + genres.append(str(genre)) media_types = [] for mt in [o for s, p, o in g.triples((s, WDT.p1163, None))]: - mt = str(mt) - mts[mt] = mts.get(mt, MediaType(id=mt)) - media_types.append(mts[mt]) + media_types.append(str(mt)) readers = [] for tools in [o for s, p, o in g.triples((s, NARA.tools, None))]: for tool in str(tools).split(';'): - sw = Software( - registry=self.registry, - id=f"{ff_id}#{len(readers)}", - name=tool.strip() - ) - readers.append(sw) + #sw = Software( + # registry=self.registry, + # id=f"{ff_id}#{len(readers)}", + # name=tool.strip() + #) + readers.append(tool.strip()) # Set up as a format entity: f = Format( - registry=self.registry, + registry_id=self.registry_id, id=ff_id, name=g.value(s, NARA.formatName), version=None, diff --git a/foreging/populate.py b/foreging/populate.py index f4baaaf0..b895da6b 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -10,16 +10,19 @@ from .trid import TrID from .wikidata import WikiData -from sqlmodel import Session, SQLModel, create_engine +from sqlite_utils import Database +import pyarrow as pa +import pyarrow.parquet as pq import argparse import logging +from pathlib import Path logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +log = logging.getLogger(__name__) # Push in the data: def populate_database(session, gen, mts, genres): - logger.info("Getting transformed format records for registry ID %s..." % gen.registry_id) + log.info("Getting transformed format records for registry ID %s..." % gen.registry_id) for f in gen.get_formats(mts, genres): session.add(f) @@ -32,30 +35,40 @@ def populate_database(session, gen, mts, genres): # Args parser = argparse.ArgumentParser() parser.add_argument('--only', required=False, choices=registries.keys()) + parser.add_argument('--jsonl', action=argparse.BooleanOptionalAction) parser.add_argument('output_file') args = parser.parse_args() + # Get the output file: + output_file = Path(args.output_file) - # Cache the cross-referenced entities: - mts = {} - genres = {} - - # Set up the session - sqlite_file_name = args.output_file - sqlite_url = f"sqlite:///{sqlite_file_name}" - - engine = create_engine(sqlite_url, echo=False) - - SQLModel.metadata.create_all(engine) - - with Session(engine).no_autoflush as session: - for reg_id in registries: - reg = registries[reg_id] - if args.only == None or args.only == reg_id: - populate_database(session, reg, mts, genres) - # Every commit should be self-consistent at this point: - session.commit() + # Gather the data: + formats = [] + for reg_id in registries: + reg = registries[reg_id] + if args.only == None or args.only == reg_id: + log.info(f"Parsing data from Registry ID = {reg.registry_id}") + for f in reg.get_formats(): + formats.append(f) + # Generate raw JSONL output + if args.jsonl: + log.info("Generating JSONL export...") + with open( output_file.with_suffix(".jsonl"), "w") as f: + for ir in formats: + f.write(ir.model_dump_json()) + f.write("\n") + # Generate SQLite DB + log.info("Generating SQLite export...") + sql_path = output_file + db = Database(sql_path, recreate=True) + for ir in formats: + db["formats"].insert(ir.model_dump()) + db["formats"].enable_fts(['name', 'version', 'summary', 'genres', 'extensions', 'media_types', 'writers', 'readers']) + log.info("Generating Parquet export...") + plain_records = [item.model_dump() for item in formats] + table = pa.Table.from_pylist(plain_records) + pq.write_table(table, output_file.with_suffix(".parquet")) diff --git a/foreging/pronom.py b/foreging/pronom.py index 5bfcc7a9..ade25870 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -2,7 +2,7 @@ import logging import datetime from bs4 import BeautifulSoup -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -25,7 +25,7 @@ def _date_parser(self, pronom_date): date = datetime.datetime.strptime(pronom_date, "%d %b %Y") return date - def get_formats(self, mts, genres): + def get_formats(self): for source_folder_name in ['fmt', 'x-fmt']: source_folder = os.path.join(self.source_folder, source_folder_name) @@ -50,8 +50,6 @@ def get_formats(self, mts, genres): f_types = [g.strip() for g in f_types] # Replace empty strings with "Undefined": f_types = ['undefined' if not g else g for g in f_types] - # And convert to SQLModel type: - f_types = [Genre(name=g) for g in f_types] # Internal signatures: if root.find('InternalSignature'): f_magic = True @@ -69,12 +67,11 @@ def get_formats(self, mts, genres): for ffi in root.findAll('FileFormatIdentifier'): if ffi.find('IdentifierType', string='MIME'): mt = ffi.find('Identifier').text - mts[mt] = mts.get(mt, MediaType(id=mt)) - mimetypes.append(mts[mt]) + mimetypes.append(mt) f_mimetypes = mimetypes # Create record: f = Format( - registry=self.registry, + registry_id=self.registry_id, id=ffd_id, name=f_name, version=root.find("FormatVersion").text, diff --git a/foreging/tcdb.py b/foreging/tcdb.py index 4f76b0e1..7b359810 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -1,7 +1,7 @@ import os import csv import logging -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -22,7 +22,7 @@ class TCDB(): index_data_url=source_file ) - def get_formats(self, mts, genres): + def get_formats(self): # First, gather rows by type_code... rows_by_type_code = {} # Open, coping with Unicode BOM @@ -54,25 +54,24 @@ def get_formats(self, mts, genres): # cat = row['Category'].strip() if cat: - genres[cat] = genres.get(cat, Genre(name=cat)) - categories.add(genres[cat]) + categories.add(cat) # names.append(row['File Name'].strip()) # Record the Software ID, adding a line number to make sure everything has distinct IDs. sw_id = f"tcdb:{type_code}:{creator_code}#L{row['_line_number']}" - sws[sw_id] = sws.get(sw_id, - Software( - registry=self.registry, - id=sw_id, - name=row['Comments'].strip(), # Software name usually stored in the Comments field. - version=None, - summary=row['File Name'].strip() - ) - ) - readers.append(sws[sw_id]) + #sws[sw_id] = sws.get(sw_id, + # Software( + # registry=self.registry, + # id=sw_id, + # name=row['Comments'].strip(), # Software name usually stored in the Comments field. + # version=None, + # summary= + # ) + #) + readers.append(row['Comments'].strip() + " " + row['File Name'].strip()) # Set up as a format entity for this type_code: f = Format( - registry=self.registry, + registry_id=self.registry_id, id=f"tcdb:{type_code}", name= ", ".join(names)[:256], # FIXME Limit size as this includes too much software information and is very slow to work with! version=None, diff --git a/foreging/tika.py b/foreging/tika.py index c684109e..ebf89fbc 100644 --- a/foreging/tika.py +++ b/foreging/tika.py @@ -3,7 +3,7 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -24,7 +24,7 @@ class Tika(): index_data_url=index_url ) - def get_formats(self, mts, gnrs): + def get_formats(self): fmts = [] with open(self.source_file, "rb") as f: xml = f.read() @@ -91,15 +91,12 @@ def get_formats(self, mts, gnrs): self.registry.data_log.extend(log) # Post-process mimetypes: - media_types = [] - for mt in finfo['mimetypes']: - mts[mt] = mts.get(mt, MediaType(id=mt)) - media_types.append(mts[mt]) + media_types = finfo['mimetypes'] parent = finfo.get('supertype', None) # Set up as a format entity: f = Format( - registry=self.registry, + registry_id=self.registry_id, id=f"{self.registry_id}:{fid}", name=finfo.get('name', None), version=None, @@ -108,7 +105,7 @@ def get_formats(self, mts, gnrs): extensions=list(finfo['extensions']), media_types=media_types, has_magic=finfo['hasMagic'], - primary_media_type=media_types[0].id, + primary_media_type=media_types[0], parent_media_type=parent, registry_url=None, registry_source_data_url=self.source_url + finfo['source'], diff --git a/foreging/trid.py b/foreging/trid.py index badf4e6c..d4086785 100644 --- a/foreging/trid.py +++ b/foreging/trid.py @@ -4,7 +4,7 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -27,17 +27,16 @@ class TrID(): fmts = [] - def add_format(self, fid, finfo, mts, gnrs): + def add_format(self, fid, finfo): media_types = [] for mt in finfo['mimetypes']: - mts[mt] = mts.get(mt, MediaType(id=mt)) - media_types.append(mts[mt]) + media_types.append(mt) extensions = [] for ext in finfo['extensions']: extensions.append(ext) # Set up as a format entity: f = Format( - registry=self.registry, + registry_id=self.registry_id, id=f"{self.registry_id}:{fid}", name=finfo.get('name', None), version=None, @@ -59,7 +58,7 @@ def add_format(self, fid, finfo, mts, gnrs): - def get_formats(self, mts, gnrs): + def get_formats(self): for filename in os.listdir(f'{self.source_dir}/triddefs_xml'): if filename.endswith(".trid.xml"): # Get Identifier? @@ -83,7 +82,7 @@ def get_formats(self, mts, gnrs): finfo['extensions'] = extensions # Get MIME types: finfo['mimetypes'] = list() - self.add_format(fid, finfo, mts, gnrs) + self.add_format(fid, finfo) # Now yield them, so all the log entries get stored too: for f in self.fmts: diff --git a/foreging/wikidata.py b/foreging/wikidata.py index dc6daab8..4ba60b81 100644 --- a/foreging/wikidata.py +++ b/foreging/wikidata.py @@ -1,6 +1,6 @@ import json import logging -from .models import Format, Software, Registry, Genre, MediaType, RegistryDataLogEntry +from .models import Format, Registry, RegistryDataLogEntry logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -26,7 +26,7 @@ class WikiData(): ) - def get_formats(self, mts, genres): + def get_formats(self): with open (self.fmt_source_file, 'r') as f: wd = json.load(f) @@ -61,8 +61,7 @@ def get_formats(self, mts, genres): finfo['extensions'].add(ext) if key == 'mimetype' and fmt[key]: mt = fmt[key] - mts[mt] = mts.get(mt, MediaType(id=mt)) - finfo['mimetypes'].add(mts[mt]) + finfo['mimetypes'].add(mt) if key == 'sig' and fmt[key]: finfo['hasMagic'] = True @@ -118,7 +117,7 @@ def make_format(self, current_qid, finfo): # Set up as a format entity: f = Format( id=f"{current_qid}", - registry=self.registry, + registry_id=self.registry_id, name=finfo['name'], version=None, summary=None, @@ -142,16 +141,17 @@ def make_format(self, current_qid, finfo): def make_software(self, info): - s = Software( - registry_id=self.registry_id, - id=f"wikidata:{info['id']}", - name=info['name'], - version=None, - summary=None, - registry_url=info['source'], - license=info['licenseLabel'], - ) - logger.debug(f"Generated software: {s}") - return s + #s = Software( + # registry_id=self.registry_id, + # id=f"wikidata:{info['id']}", + # name=info['name'], + # version=None, + # summary=None, + # registry_url=info['source'], + # license=info['licenseLabel'], + #) + #logger.debug(f"Generated software: {s}") + #return s + return info['name'] diff --git a/pyproject.toml b/pyproject.toml index 882e0d25..8a011b79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "lxml", "rdflib", "polyfile", + "pyarrow", "pydantic", "sqlite-utils", "sqlmodel" From b252615fc819de25ba1a45f1090d07a5d2cd6902 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 26 Sep 2025 13:39:46 +0100 Subject: [PATCH 46/53] Fixed up PK/FK relationships. --- .gitignore | 1 + Makefile | 14 ++++---------- foreging/populate.py | 44 +++++++++++++++++++++++++++++++------------- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 4292fdd5..23d8fdeb 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ /registries.db /.venv /build +/data diff --git a/Makefile b/Makefile index d60d52dc..f0e75640 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,8 @@ -all: registries.db +all: data/registries.db -registries.db: foreging/*.py - rm -f $@ $@.tmp +data/registries.db: foreging/*.py + rm -fr data mkdir -p data - python -m foreging.populate $@.tmp - sqlite-utils enable-fts $@.tmp format name version summary extensions - sqlite-utils enable-fts $@.tmp media_type id - sqlite-utils enable-fts $@.tmp genre name - sqlite-utils enable-fts $@.tmp software name version summary - sqlite-utils enable-fts $@.tmp registry_data_log level message - mv $@.tmp $@ + python -m foreging.populate data diff --git a/foreging/populate.py b/foreging/populate.py index b895da6b..35399288 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -36,11 +36,15 @@ def populate_database(session, gen, mts, genres): parser = argparse.ArgumentParser() parser.add_argument('--only', required=False, choices=registries.keys()) parser.add_argument('--jsonl', action=argparse.BooleanOptionalAction) - parser.add_argument('output_file') + parser.add_argument('output_path') args = parser.parse_args() # Get the output file: - output_file = Path(args.output_file) + output_path= Path(args.output_path) + if not output_path.exists(): + output_path.mkdir() + elif output_path.is_file(): + raise Exception("Output path should not be a file!") # Gather the data: formats = [] @@ -51,24 +55,38 @@ def populate_database(session, gen, mts, genres): for f in reg.get_formats(): formats.append(f) + # Define outputs: + outputs = { + "formats": formats, + "registries": [registries[id].registry for id in registries] + } + # Generate raw JSONL output if args.jsonl: log.info("Generating JSONL export...") - with open( output_file.with_suffix(".jsonl"), "w") as f: - for ir in formats: - f.write(ir.model_dump_json()) - f.write("\n") + for name, records in outputs.items(): + with open( output_path / f"{name}.jsonl", "w") as f: + for r in records: + f.write(r.model_dump_json()) + f.write("\n") + + # And generate Parquet version: + log.info("Generating Parquet export...") + for name, records in outputs.items(): + plain_records = [item.model_dump() for item in records] + table = pa.Table.from_pylist(plain_records) + pq.write_table(table, output_path / f"{name}.parquet") # Generate SQLite DB log.info("Generating SQLite export...") - sql_path = output_file + sql_path = output_path / "registries.db" db = Database(sql_path, recreate=True) - for ir in formats: - db["formats"].insert(ir.model_dump()) + for r in formats: + db["formats"].insert(r.model_dump(), pk="id") + for r in outputs["registries"]: + db["registries"].insert(r.model_dump(), pk="id") db["formats"].enable_fts(['name', 'version', 'summary', 'genres', 'extensions', 'media_types', 'writers', 'readers']) + # sqlite-utils add-foreign-key data/registries.db formats registry_id registries id + db["formats"].add_foreign_key('registry_id', 'registries', 'id') - log.info("Generating Parquet export...") - plain_records = [item.model_dump() for item in formats] - table = pa.Table.from_pylist(plain_records) - pq.write_table(table, output_file.with_suffix(".parquet")) From 829c95883b8f1a57013bc9a22fd7af09aff554e7 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 26 Sep 2025 16:29:50 +0100 Subject: [PATCH 47/53] Now outputs multiple sorted Parquet files and JSONL works too. --- Makefile | 6 +++--- foreging/populate.py | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index f0e75640..d66b7c7d 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ -all: data/registries.db +all: registries.db -data/registries.db: foreging/*.py +registries.db: foreging/*.py rm -fr data mkdir -p data - python -m foreging.populate data + python -m foreging.populate --json data diff --git a/foreging/populate.py b/foreging/populate.py index 35399288..a16ff5a6 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -9,13 +9,16 @@ from .tika import Tika from .trid import TrID from .wikidata import WikiData +from .models import Format +from pydantic import BaseModel from sqlite_utils import Database import pyarrow as pa import pyarrow.parquet as pq import argparse import logging from pathlib import Path +import json logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) @@ -55,10 +58,26 @@ def populate_database(session, gen, mts, genres): for f in reg.get_formats(): formats.append(f) + # Generate extensions lookup dataset, sorted by extension to hopefully make it faster: + ext_to_fmt = {} + for f in formats: + f: Format + for ext in f.extensions: + entries: list = ext_to_fmt.get( ext, [] ) + entries.append(f) + ext_to_fmt[ext] = entries + extensions = [] + for ext,fmts in sorted(ext_to_fmt.items()): + extensions.append({ + 'id': ext, + 'format_ids': [f.id for f in fmts] + }) + # Define outputs: outputs = { + "registries": [registries[id].registry for id in registries], "formats": formats, - "registries": [registries[id].registry for id in registries] + "extensions": extensions } # Generate raw JSONL output @@ -67,15 +86,27 @@ def populate_database(session, gen, mts, genres): for name, records in outputs.items(): with open( output_path / f"{name}.jsonl", "w") as f: for r in records: - f.write(r.model_dump_json()) + if isinstance(r, BaseModel): + f.write(r.model_dump_json()) + else: + f.write(json.dumps(r)) f.write("\n") # And generate Parquet version: log.info("Generating Parquet export...") for name, records in outputs.items(): - plain_records = [item.model_dump() for item in records] + plain_records = [] + for item in records: + if isinstance(item, BaseModel): + plain_records.append(item.model_dump()) + else: + plain_records.append(item) table = pa.Table.from_pylist(plain_records) - pq.write_table(table, output_path / f"{name}.parquet") + # Sort the records by the ID field and note this in the Parquet: + sort_order = [('id', 'ascending')] + table = table.sort_by(sort_order) + sorting_columns = pq.SortingColumn.from_ordering(table.schema, sort_order) + pq.write_table(table, output_path / f"{name}.parquet", write_page_index=True, sorting_columns=sorting_columns) # Generate SQLite DB log.info("Generating SQLite export...") From bec5459f727ac1010f06aa2e1bb41fb1077ea0fa Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 26 Sep 2025 18:22:46 +0100 Subject: [PATCH 48/53] Added ABC for a bit more rigor. --- foreging/ffw.py | 4 ++-- foreging/file.py | 4 ++-- foreging/linguist.py | 4 ++-- foreging/loc_fdd.py | 4 ++-- foreging/mediainfo.py | 4 ++-- foreging/models.py | 17 +++++++++++++++-- foreging/nara.py | 4 ++-- foreging/pronom.py | 4 ++-- foreging/tcdb.py | 4 ++-- foreging/tika.py | 4 ++-- foreging/trid.py | 4 ++-- foreging/wikidata.py | 4 ++-- 12 files changed, 37 insertions(+), 24 deletions(-) diff --git a/foreging/ffw.py b/foreging/ffw.py index e5a2c033..46fbdf88 100644 --- a/foreging/ffw.py +++ b/foreging/ffw.py @@ -1,14 +1,14 @@ import json import yaml import logging -from .models import Format, Registry +from .models import Format, Registry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # -class FFW(): +class FFW(RegistryClient): source_file = "digipres.github.io/_sources/registries/mediawikis/ffw.yml" # Set up the Registry object for this class: registry_id = "ffw" diff --git a/foreging/file.py b/foreging/file.py index c73a6118..53bdb65b 100644 --- a/foreging/file.py +++ b/foreging/file.py @@ -3,14 +3,14 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Registry +from .models import Format, Registry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # -class File(): +class File(RegistryClient): source_file = "digipres.github.io/_sources/registries/file/polyfile-magic.jsonl" source_url = "https://github.com/trailofbits/polyfile/tree/master/polyfile/magic_defs" index_url = "https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/file/polyfile-magic.jsonl" diff --git a/foreging/linguist.py b/foreging/linguist.py index b186dc4a..8b6ab05d 100644 --- a/foreging/linguist.py +++ b/foreging/linguist.py @@ -1,14 +1,14 @@ import json import yaml import logging -from .models import Format, Registry +from .models import Format, Registry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # -class Linguist(): +class Linguist(RegistryClient): source_file = "digipres.github.io/_sources/registries/githublinguist/languages.yml" # Set up the Registry object for this class: registry_id = "linguist" diff --git a/foreging/loc_fdd.py b/foreging/loc_fdd.py index 9acb19b1..b5b589bd 100644 --- a/foreging/loc_fdd.py +++ b/foreging/loc_fdd.py @@ -2,12 +2,12 @@ import logging import datetime from bs4 import BeautifulSoup -from .models import Format, Registry, RegistryDataLogEntry +from .models import Format, Registry, RegistryDataLogEntry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -class LocFDD(): +class LocFDD(RegistryClient): registry_id = "lcfdd" source_folder = 'digipres.github.io/_sources/registries/fdd/fddXML' show_parsed_xml_on_errors = False diff --git a/foreging/mediainfo.py b/foreging/mediainfo.py index 7f9c801c..4b0a9d4d 100644 --- a/foreging/mediainfo.py +++ b/foreging/mediainfo.py @@ -3,14 +3,14 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Registry +from .models import Format, Registry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # -class MediaInfo(): +class MediaInfo(RegistryClient): source_file = "digipres.github.io/_sources/registries/mediainfo/mediainfo.jsonl" source_url = "https://mediaarea.net/en/MediaInfo/Support/Formats" index_url = "https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/mediainfo/mediainfo.jsonl" diff --git a/foreging/models.py b/foreging/models.py index f5b11a23..6751191e 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -1,9 +1,12 @@ import json -from typing import List, Optional, Set, Dict, Tuple, Type, Union, Literal, Annotated +from typing import List, Optional, Set, Dict, Tuple, Type, Union, Literal, Annotated, Iterator from pydantic import BaseModel, Field from datetime import datetime, date +from abc import ABC, abstractmethod +# # Data model of normalised form of a format record: +# class Format(BaseModel): id: str | None name: str | None @@ -28,7 +31,7 @@ class Format(BaseModel): # -# And for a Registry +# And for a Registry: # class RegistryDataLogEntry(BaseModel): level: str @@ -50,3 +53,13 @@ class Registry(BaseModel): # Log for any issues data_log: list[RegistryDataLogEntry] = [] + +# +# An Abstract Base Class for the client code: +# +class RegistryClient(ABC): + + @abstractmethod + def get_formats(self) -> Iterator[Format]: + ... + diff --git a/foreging/nara.py b/foreging/nara.py index 8def0ae1..64a8ae92 100644 --- a/foreging/nara.py +++ b/foreging/nara.py @@ -3,7 +3,7 @@ from rdflib import Graph, RDF, DCTERMS from rdflib.namespace import DefinedNamespace, Namespace from rdflib.term import URIRef -from .models import Format, Registry +from .models import Format, Registry, RegistryClient logger = logging.getLogger(__name__) @@ -35,7 +35,7 @@ class WDT(DefinedNamespace): # # NARA File Format Preservation Plan parser # -class NARA_FFPP(): +class NARA_FFPP(RegistryClient): registry_id = "naradpf" source_file = 'digipres.github.io/_sources/registries/nara/fileformats.ttl' registry = Registry( diff --git a/foreging/pronom.py b/foreging/pronom.py index ade25870..ca9a6359 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -2,12 +2,12 @@ import logging import datetime from bs4 import BeautifulSoup -from .models import Format, Registry +from .models import Format, Registry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -class PRONOM(): +class PRONOM(RegistryClient): registry_id = "pronom" source_folder = 'digipres.github.io/_sources/registries/pronom/' warnings = [] diff --git a/foreging/tcdb.py b/foreging/tcdb.py index 7b359810..5f9f7c6a 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -1,7 +1,7 @@ import os import csv import logging -from .models import Format, Registry +from .models import Format, Registry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -10,7 +10,7 @@ # # TCDB CSV dump parser # -class TCDB(): +class TCDB(RegistryClient): registry_id = "tcdb" registry_url = f"https://github.com/thorsted/Born-Digital-Scripts/tree/main/TC%20Identification" source_file = 'digipres.github.io/_sources/registries/tcdb/TCDB_2003.8_data-cleaned.csv' diff --git a/foreging/tika.py b/foreging/tika.py index ebf89fbc..06dc9dcb 100644 --- a/foreging/tika.py +++ b/foreging/tika.py @@ -3,14 +3,14 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Registry, RegistryDataLogEntry +from .models import Format, Registry, RegistryDataLogEntry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # -class Tika(): +class Tika(RegistryClient): source_file = "digipres.github.io/_sources/registries/tika/tika-mimetypes.xml" source_url = "https://svn.apache.org/repos/asf/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml" index_url = "https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/tika/tika-mimetypes.xml" diff --git a/foreging/trid.py b/foreging/trid.py index d4086785..f0732b34 100644 --- a/foreging/trid.py +++ b/foreging/trid.py @@ -4,14 +4,14 @@ from lxml import etree from io import BytesIO import logging -from .models import Format, Registry +from .models import Format, Registry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # -class TrID(): +class TrID(RegistryClient): source_dir = "digipres.github.io/_sources/registries/trid" source_url = "" index_url = "https://github.com/digipres/digipres.github.io/tree/master/_sources/registries/trid/triddefs_xml/" diff --git a/foreging/wikidata.py b/foreging/wikidata.py index 4ba60b81..8a4baabb 100644 --- a/foreging/wikidata.py +++ b/foreging/wikidata.py @@ -1,6 +1,6 @@ import json import logging -from .models import Format, Registry, RegistryDataLogEntry +from .models import Format, Registry, RegistryDataLogEntry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -9,7 +9,7 @@ # # WikiData dumps parser # -class WikiData(): +class WikiData(RegistryClient): source_file_dir = "digipres.github.io/_sources/registries/wikidata" fmt_source_file = f"{source_file_dir}/wikidata.json" sw_r_source_file = f"{source_file_dir}/wikidata-reads.json" From 6aa8718f82fd674574d8a190872b593f819c6151 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 26 Sep 2025 18:35:44 +0100 Subject: [PATCH 49/53] Added back in full Software objects, nested only for now. --- foreging/models.py | 21 +++++++++++++++++++-- foreging/nara.py | 14 +++++++------- foreging/populate.py | 5 +++-- foreging/tcdb.py | 22 +++++++++++----------- foreging/wikidata.py | 25 ++++++++++++------------- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/foreging/models.py b/foreging/models.py index 6751191e..077fff1e 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -4,6 +4,23 @@ from datetime import datetime, date from abc import ABC, abstractmethod +# +# A Software record +# +class Software(BaseModel): + id: str + name: str + version: Optional[str] = None + summary: Optional[str] = None + license: Optional[str] = None + registry_url: Optional[str] = None + + # Define how to spot unique entries in a set + def __hash__(self): + return hash(self.id) + def __eq__(self,other): + return self.id == other.id + # # Data model of normalised form of a format record: # @@ -24,8 +41,8 @@ class Format(BaseModel): created: date | None = Field(index=True) last_modified: date | None = Field(index=True) - readers: Optional[list[str]] = [] - writers: Optional[list[str]] = [] + readers: Optional[list[Software]] = [] + writers: Optional[list[Software]] = [] registry_id: str | None diff --git a/foreging/nara.py b/foreging/nara.py index 64a8ae92..7c5876b0 100644 --- a/foreging/nara.py +++ b/foreging/nara.py @@ -3,7 +3,7 @@ from rdflib import Graph, RDF, DCTERMS from rdflib.namespace import DefinedNamespace, Namespace from rdflib.term import URIRef -from .models import Format, Registry, RegistryClient +from .models import Software, Format, Registry, RegistryClient logger = logging.getLogger(__name__) @@ -73,12 +73,12 @@ def get_formats(self): readers = [] for tools in [o for s, p, o in g.triples((s, NARA.tools, None))]: for tool in str(tools).split(';'): - #sw = Software( - # registry=self.registry, - # id=f"{ff_id}#{len(readers)}", - # name=tool.strip() - #) - readers.append(tool.strip()) + sw = Software( + registry=self.registry, + id=f"{ff_id}#{len(readers)}", + name=tool.strip() + ) + readers.append(sw) # Set up as a format entity: f = Format( diff --git a/foreging/populate.py b/foreging/populate.py index a16ff5a6..f0434e5a 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -9,7 +9,7 @@ from .tika import Tika from .trid import TrID from .wikidata import WikiData -from .models import Format +from .models import Format, RegistryClient from pydantic import BaseModel from sqlite_utils import Database @@ -19,6 +19,7 @@ import logging from pathlib import Path import json +from typing import Dict logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) @@ -31,7 +32,7 @@ def populate_database(session, gen, mts, genres): if __name__ == "__main__": # Registries - registries = {} + registries: Dict[str,RegistryClient] = {} for r in [File(), FFW(), Linguist(), LocFDD(), MediaInfo(), NARA_FFPP(), PRONOM(), TCDB(), Tika(), TrID(), WikiData()]: registries[r.registry.id] = r diff --git a/foreging/tcdb.py b/foreging/tcdb.py index 5f9f7c6a..191bf43d 100644 --- a/foreging/tcdb.py +++ b/foreging/tcdb.py @@ -1,7 +1,7 @@ import os import csv import logging -from .models import Format, Registry, RegistryClient +from .models import Software, Format, Registry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -59,16 +59,16 @@ def get_formats(self): names.append(row['File Name'].strip()) # Record the Software ID, adding a line number to make sure everything has distinct IDs. sw_id = f"tcdb:{type_code}:{creator_code}#L{row['_line_number']}" - #sws[sw_id] = sws.get(sw_id, - # Software( - # registry=self.registry, - # id=sw_id, - # name=row['Comments'].strip(), # Software name usually stored in the Comments field. - # version=None, - # summary= - # ) - #) - readers.append(row['Comments'].strip() + " " + row['File Name'].strip()) + sws[sw_id] = sws.get(sw_id, + Software( + registry=self.registry, + id=sw_id, + name=row['Comments'].strip(), # Software name usually stored in the Comments field. + version=None, + summary=row['File Name'].strip() + ) + ) + readers.append(sws[sw_id]) # Set up as a format entity for this type_code: f = Format( registry_id=self.registry_id, diff --git a/foreging/wikidata.py b/foreging/wikidata.py index 8a4baabb..b076573b 100644 --- a/foreging/wikidata.py +++ b/foreging/wikidata.py @@ -1,6 +1,6 @@ import json import logging -from .models import Format, Registry, RegistryDataLogEntry, RegistryClient +from .models import Software, Format, Registry, RegistryDataLogEntry, RegistryClient logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -141,17 +141,16 @@ def make_format(self, current_qid, finfo): def make_software(self, info): - #s = Software( - # registry_id=self.registry_id, - # id=f"wikidata:{info['id']}", - # name=info['name'], - # version=None, - # summary=None, - # registry_url=info['source'], - # license=info['licenseLabel'], - #) - #logger.debug(f"Generated software: {s}") - #return s - return info['name'] + s = Software( + registry_id=self.registry_id, + id=f"wikidata:{info['id']}", + name=info['name'], + version=None, + summary=None, + registry_url=info['source'], + license=info['licenseLabel'], + ) + logger.debug(f"Generated software: {s}") + return s From bc76991eac4e84179c663136013daf0c5eb4e5d0 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 26 Sep 2025 19:32:55 +0100 Subject: [PATCH 50/53] Add more outputs. --- derive.sh | 3 ++- digipres.github.io | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/derive.sh b/derive.sh index 7a0fc294..5f9ce498 100755 --- a/derive.sh +++ b/derive.sh @@ -5,5 +5,6 @@ source venv/bin/activate make -cp registries.db digipres.github.io/_data/formats/ +cp data/registries.db digipres.github.io/_data/formats/ +cp data/*.parquet digipres.github.io/_data/formats/index diff --git a/digipres.github.io b/digipres.github.io index b44cf47f..1a6e3b2f 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit b44cf47f6bb259f202b4b97775b60ab5957ae412 +Subproject commit 1a6e3b2f641abfff19cb7861a1fbeb355321f4d0 From b737500c54a12a340ec085751fe3dbe4b30e0471 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 26 Sep 2025 22:37:23 +0100 Subject: [PATCH 51/53] Adding some experimental code for talking to COPTR. --- foreging/coptr.py | 69 ++++++++++++++++++++++++++++++++++++++++++++++ foreging/models.py | 18 ++++++------ pyproject.toml | 2 ++ 3 files changed, 80 insertions(+), 9 deletions(-) create mode 100644 foreging/coptr.py diff --git a/foreging/coptr.py b/foreging/coptr.py new file mode 100644 index 00000000..e6c91f46 --- /dev/null +++ b/foreging/coptr.py @@ -0,0 +1,69 @@ +import mwclient as mw +from mwclient.listing import Category, PageList +import mwparserfromhell + +from .models import Software + +import logging +logging.basicConfig(level=logging.WARNING) + + +coptr_host = 'coptr.digipres.org' +user_agent = 'DigiPresFormatIndexClient/0.1 (andrew.jackson@dpconline.org)' +site = mw.Site(coptr_host, path='/', clients_useragent=user_agent) + +#for tool_page in site.allpages(): +# pass + +#category = site.categories[u"Tool Grid"] +#for page in category: +# print(page.name) + + +# {{Infobox tool +# |image=JHOVE.gif +# |purpose=JHOVE provides functions to perform format-specific identification, validation, and characterization of digital objects. +# |homepage=http://jhove.openpreservation.org/ +# |license=GNU Lesser General Public License (LGPL) +# |platforms=JHOVE should be usable on any UNIX, Windows, or OS X platform with an appropriate J2SE installation. It should run on any operating system that supports Java 1.5 and has a directory-based file system. +# |formats_in=EPUB, GIF, JP2, JPEG, PDF, PNG, PREMIS (Preservation Metadata Implementation Strategies), TIFF, WARC, XML, AIFF, WAVE, GZIP, ASCII, UTF-8, HTML, MP3 +# |function=Encryption Detection, File Format Identification, Metadata Extraction, Validation +# }} + +category: PageList = site.categories[u"Tools"] +for page in category: + print(page.name) + text = page.text() + wikicode = mwparserfromhell.parse(text) + templates = wikicode.filter_templates(matches='infobox tool') + template = templates[0] + formats = template.get("formats_in", None) + if formats: + formats = [f.strip() for f in formats.value.split(",")] + print(f" < {formats}") + formats = template.get("formats_out", None) + if formats: + formats = [f.strip() for f in formats.value.split(",")] + print(f" > {formats}") + print(page.pageid) + if isinstance(page, Category): + for member in page.members(): + print(f"{page.name} > {member.name}") + else: + pass + s = Software( + id=f"coptr:pageid:{page.pageid}", + name=page.name, + version=None, + license=None, + registry_url=f"https://{coptr_host}/Special:Redirect/page/{page.pageid}" + ) + license = template.get('license', None) + if license: + s.license = license.value.strip() + print(s) + + + +# Workflows in Workflow namespace +# Formats is another potential category, but needs patching in via external IDs. \ No newline at end of file diff --git a/foreging/models.py b/foreging/models.py index 077fff1e..cc21f432 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -1,6 +1,6 @@ import json from typing import List, Optional, Set, Dict, Tuple, Type, Union, Literal, Annotated, Iterator -from pydantic import BaseModel, Field +from pydantic import BaseModel from datetime import datetime, date from abc import ABC, abstractmethod @@ -32,14 +32,14 @@ class Format(BaseModel): genres: list[str] | None extensions: list[str] | None media_types: list[str] - has_magic: bool = Field(default=False) - primary_media_type: str | None = Field(index=True) - parent_media_type: str | None = Field(index=True) - registry_url: str | None = Field(index=True) - registry_source_data_url: str | None = Field(index=True) - registry_index_data_url: str | None = Field(index=True) - created: date | None = Field(index=True) - last_modified: date | None = Field(index=True) + has_magic: bool + primary_media_type: str | None = None + parent_media_type: str | None = None + registry_url: str | None = None + registry_source_data_url: str | None = None + registry_index_data_url: str | None = None + created: date | None = None + last_modified: date | None = None readers: Optional[list[Software]] = [] writers: Optional[list[Software]] = [] diff --git a/pyproject.toml b/pyproject.toml index 8a011b79..2bc7329b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,8 @@ dependencies = [ "pyyaml", "beautifulsoup4", "lxml", + "mwclient", + "mwparserfromhell", "rdflib", "polyfile", "pyarrow", From d289aa3e249cec57a8292b33dcff0e5dc30b5c8a Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 26 Sep 2025 23:18:58 +0100 Subject: [PATCH 52/53] Now generates extension sets for each registry --- foreging/models.py | 7 +++++++ foreging/populate.py | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/foreging/models.py b/foreging/models.py index cc21f432..4f101ab7 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -69,6 +69,8 @@ class Registry(BaseModel): index_data_url: Optional[str] = None # Log for any issues data_log: list[RegistryDataLogEntry] = [] + # The set of extensions known to this registry: + extensions: Set[str] = None # @@ -76,6 +78,11 @@ class Registry(BaseModel): # class RegistryClient(ABC): + @property + @abstractmethod + def registry(self) -> Registry: + pass + @abstractmethod def get_formats(self) -> Iterator[Format]: ... diff --git a/foreging/populate.py b/foreging/populate.py index f0434e5a..761eedc3 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -20,6 +20,7 @@ from pathlib import Path import json from typing import Dict +from collections import defaultdict logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) @@ -54,10 +55,13 @@ def populate_database(session, gen, mts, genres): formats = [] for reg_id in registries: reg = registries[reg_id] + reg.registry.extensions = set() if args.only == None or args.only == reg_id: log.info(f"Parsing data from Registry ID = {reg.registry_id}") for f in reg.get_formats(): formats.append(f) + for ext in f.extensions: + reg.registry.extensions.add(ext.lower()) # Generate extensions lookup dataset, sorted by extension to hopefully make it faster: ext_to_fmt = {} From e11ee6c1ce8673511939435549eb1fb1597cce23 Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Fri, 3 Oct 2025 12:56:58 +0100 Subject: [PATCH 53/53] Add TFFH source. Add support for release dates. Fix error in Linguist content types. --- Makefile | 5 ++-- digipres.github.io | 2 +- foreging/coptr.py | 3 +++ foreging/ffw.py | 11 +++++++++ foreging/linguist.py | 2 +- foreging/models.py | 6 ++++- foreging/populate.py | 5 +++- foreging/pronom.py | 6 +++++ foreging/tffh.py | 59 ++++++++++++++++++++++++++++++++++++++++++++ run-in-datasette.sh | 2 ++ 10 files changed, 95 insertions(+), 6 deletions(-) create mode 100644 foreging/tffh.py create mode 100755 run-in-datasette.sh diff --git a/Makefile b/Makefile index d66b7c7d..8d53fe8f 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,9 @@ all: registries.db -registries.db: foreging/*.py +registries.db: foreging/*.py digipres.github.io/_sources/registries/* rm -fr data mkdir -p data python -m foreging.populate --json data - + cp data/registries.db digipres.github.io/_data/formats/registries.db + cp data/*.parquet digipres.github.io/_data/formats/index/ diff --git a/digipres.github.io b/digipres.github.io index 1a6e3b2f..35f8c1ae 160000 --- a/digipres.github.io +++ b/digipres.github.io @@ -1 +1 @@ -Subproject commit 1a6e3b2f641abfff19cb7861a1fbeb355321f4d0 +Subproject commit 35f8c1ae870ac2aafab8f921753a2c682c4ace24 diff --git a/foreging/coptr.py b/foreging/coptr.py index e6c91f46..bb9b8f11 100644 --- a/foreging/coptr.py +++ b/foreging/coptr.py @@ -30,6 +30,9 @@ # |function=Encryption Detection, File Format Identification, Metadata Extraction, Validation # }} +# FIXME this does both at once! One should write the page info needed to JSON. The other should use it. +# But, we don't know everything we need yet, I guess? + category: PageList = site.categories[u"Tools"] for page in category: print(page.name) diff --git a/foreging/ffw.py b/foreging/ffw.py index 46fbdf88..d2f01ea9 100644 --- a/foreging/ffw.py +++ b/foreging/ffw.py @@ -44,6 +44,16 @@ def get_formats(self): f_info['categories'] = fmt[key] else: f_info[key] = fmt[key] + # Released... + released_in: str = fmt.get('released', None) + # Drop empty entries: + if released_in == '': + released_in = None + # Drop any .... content: + if released_in: + ref_index = released_in.find(" -1: + released_in = released_in[0:ref_index] # Set up as a format entity: f = Format( @@ -58,6 +68,7 @@ def get_formats(self): has_magic=f_info['hasMagic'], primary_media_type=None, parent_media_type=None, + released_in=released_in, registry_url=fmt['source'], registry_source_data_url=fmt['source'], registry_index_data_url=None, diff --git a/foreging/linguist.py b/foreging/linguist.py index 8b6ab05d..b945f34e 100644 --- a/foreging/linguist.py +++ b/foreging/linguist.py @@ -40,7 +40,7 @@ def get_formats(self): ext=ext.strip('.') # Drop the prefix dot f_info['extensions'].add(ext) elif key == 'codemirror_mime_type': - f_info['mimetypes'] = fmt[key] + f_info['mimetypes'].add(fmt[key]) else: f_info[key] = fmt[key] diff --git a/foreging/models.py b/foreging/models.py index 4f101ab7..a412afcc 100644 --- a/foreging/models.py +++ b/foreging/models.py @@ -35,9 +35,13 @@ class Format(BaseModel): has_magic: bool primary_media_type: str | None = None parent_media_type: str | None = None + # When this format was published or became available for use (ideally the release year): + released_in: str | None = None + # Tracing back to the source registry: registry_url: str | None = None registry_source_data_url: str | None = None registry_index_data_url: str | None = None + # The created and modified dates for this record about a format created: date | None = None last_modified: date | None = None @@ -70,7 +74,7 @@ class Registry(BaseModel): # Log for any issues data_log: list[RegistryDataLogEntry] = [] # The set of extensions known to this registry: - extensions: Set[str] = None + extensions: list[str] = None # diff --git a/foreging/populate.py b/foreging/populate.py index 761eedc3..3c411355 100644 --- a/foreging/populate.py +++ b/foreging/populate.py @@ -6,6 +6,7 @@ from .nara import NARA_FFPP from .pronom import PRONOM from .tcdb import TCDB +from .tffh import TFFH from .tika import Tika from .trid import TrID from .wikidata import WikiData @@ -34,7 +35,7 @@ def populate_database(session, gen, mts, genres): if __name__ == "__main__": # Registries registries: Dict[str,RegistryClient] = {} - for r in [File(), FFW(), Linguist(), LocFDD(), MediaInfo(), NARA_FFPP(), PRONOM(), TCDB(), Tika(), TrID(), WikiData()]: + for r in [File(), FFW(), Linguist(), LocFDD(), MediaInfo(), NARA_FFPP(), PRONOM(), TCDB(), TFFH(), Tika(), TrID(), WikiData()]: registries[r.registry.id] = r # Args @@ -62,6 +63,8 @@ def populate_database(session, gen, mts, genres): formats.append(f) for ext in f.extensions: reg.registry.extensions.add(ext.lower()) + # Convert set to list: + reg.registry.extensions = list(reg.registry.extensions) # Generate extensions lookup dataset, sorted by extension to hopefully make it faster: ext_to_fmt = {} diff --git a/foreging/pronom.py b/foreging/pronom.py index ca9a6359..6d9875b1 100644 --- a/foreging/pronom.py +++ b/foreging/pronom.py @@ -69,6 +69,11 @@ def get_formats(self): mt = ffi.find('Identifier').text mimetypes.append(mt) f_mimetypes = mimetypes + # Release date as year (source format is '24 Dec 1999' if not empty/whitespace): + release_year = root.find("ReleaseDate").text.strip() + if release_year: + release_year = datetime.datetime.strptime(release_year, "%d %b %Y") + release_year = str(release_year.year) # Create record: f = Format( registry_id=self.registry_id, @@ -82,6 +87,7 @@ def get_formats(self): has_magic=f_magic, primary_media_type=None, parent_media_type=None, + released_in=release_year, registry_url=f"https://www.nationalarchives.gov.uk/pronom/{ffd_id}", registry_source_data_url=f"https://www.nationalarchives.gov.uk/pronom/{ffd_id}.xml", registry_index_data_url=f"https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/pronom/{ffd_id}.xml", diff --git a/foreging/tffh.py b/foreging/tffh.py new file mode 100644 index 00000000..9ea4b923 --- /dev/null +++ b/foreging/tffh.py @@ -0,0 +1,59 @@ +import csv +import logging +from .models import Format, Registry, RegistryClient + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# +class TFFH(RegistryClient): + source_file = "digipres.github.io/_sources/registries/tffh/tffh.csv" + index_url = "https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/tffh/tffh.csv" + # Set up the Registry object for this class: + registry_id = "tffh" + registry = Registry( + id=registry_id, + name="The File Formats Handbook (1995)", + url="https://github.com/digipres/digipres.github.io/blob/master/_sources/registries/tffh/README.md", + id_prefix=None, + index_data_url=index_url + ) + + def get_formats(self): + fmts = [] + idx = 1 + with open(self.source_file, "r") as f: + reader = csv.DictReader(f) + for row in reader: + idx += 1 + primary_exts = row['Primary File'].strip() + if primary_exts: + # Assemble all entries, skipping empties: + extensions = (primary_exts + " " + row["Secondary Files"].strip()).split(" ") + extensions = [e for e in extensions if e] + # Set up as a format entity: + f = Format( + registry_id=self.registry_id, + id=f"{self.registry_id}:{idx}", + name=row["Name"].strip(), + version=None, + summary=None, + genres=[row["Genre"].strip()], + extensions=extensions, + media_types=[], + has_magic=True, + primary_media_type=None, + parent_media_type=None, + registry_url=None, + registry_source_data_url=None, + registry_index_data_url=f"{self.index_url}#L{idx}", + created=None, + last_modified=None + ) + # And record the entry: + fmts.append(f) + + # Now yield them, so all the log entries get stored too: + for f in fmts: + yield f \ No newline at end of file diff --git a/run-in-datasette.sh b/run-in-datasette.sh new file mode 100755 index 00000000..59b72b1a --- /dev/null +++ b/run-in-datasette.sh @@ -0,0 +1,2 @@ +#!/bin/sh +uvx datasette digipres.github.io/_data/formats/registries.db --config facet_time_limit_ms:1000 --config default_facet_size:15