diff --git a/.gitignore b/.gitignore index 9084155..0dd25cf 100644 --- a/.gitignore +++ b/.gitignore @@ -315,4 +315,7 @@ pyrightconfig.json .vscode output tmp -extract \ No newline at end of file +extract + +# File(s) fetched by hatch-openzim plugin +scraper/src/maps2zim/assets/mapbox-gl-rtl-text.js diff --git a/CHANGELOG.md b/CHANGELOG.md index c25d033..94ae24c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Remove pin button with display of coordinates and zoom (#58) - Remove style selector (#57) - Switch to automatically chosen map style based on prefer-colors-scheme (#56) +- Fix support of RTL strings in the map (#54) +- Move from --assets to --dl CLI param for code clarity (#68) +- Fix bad favicon paths (#68) ## [0.1.1] - 2026-03-10 diff --git a/Dockerfile b/Dockerfile index 589001f..36bdd51 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ WORKDIR /output # Copy pyproject.toml and its dependencies COPY README.md /src/ -COPY scraper/pyproject.toml /src/scraper/ +COPY scraper/pyproject.toml scraper/openzim.toml /src/scraper/ COPY scraper/src/maps2zim/__about__.py /src/scraper/src/maps2zim/__about__.py # Install Python dependencies diff --git a/scraper/openzim.toml b/scraper/openzim.toml new file mode 100644 index 0000000..514459a --- /dev/null +++ b/scraper/openzim.toml @@ -0,0 +1,7 @@ +[files.assets.config] +target_dir="src/maps2zim/assets" + +[files.assets.actions."mapbox-gl-rtl-text"] +action="get_file" +source="https://unpkg.com/@mapbox/mapbox-gl-rtl-text@0.4.0/dist/mapbox-gl-rtl-text.js" +target_file="mapbox-gl-rtl-text.js" diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index 04c66ac..3e65b93 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -20,6 +20,8 @@ dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] kind = "scraper" additional-keywords = ["openstreetmap"] +[tool.hatch.build.hooks.openzim-build] + [project.optional-dependencies] scripts = ["invoke==2.2.1"] lint = ["black==26.1.0", "ruff==0.15.0"] diff --git a/scraper/src/maps2zim/context.py b/scraper/src/maps2zim/context.py index 0f2b36c..ae73894 100644 --- a/scraper/src/maps2zim/context.py +++ b/scraper/src/maps2zim/context.py @@ -38,11 +38,11 @@ class Context: # info passed in User-Agent header of web requests contact_info: str = "https://www.kiwix.org" - # temporary folder to store temporary assets (e.g. cached API response) + # temporary folder to store temporary files (e.g. cached API response) tmp_folder: Path - # folder to fetch / store downloaded assets (can be reused across runs) - assets_folder: Path + # folder to fetch / store downloaded files (can be reused across runs) + dl_folder: Path # folder where the ZIM will be built output_folder: Path = Path(os.getenv("MAPS_OUTPUT", "output")) diff --git a/scraper/src/maps2zim/entrypoint.py b/scraper/src/maps2zim/entrypoint.py index d72d3ae..1bc5b6e 100644 --- a/scraper/src/maps2zim/entrypoint.py +++ b/scraper/src/maps2zim/entrypoint.py @@ -135,11 +135,11 @@ def prepare_context(raw_args: list[str], tmpdir: str) -> None: ) parser.add_argument( - "--assets", - help="Folder folder to fetch / store downloaded assets (can be reused across " + "--dl", + help="Folder folder to fetch / store downloaded files (can be reused across " "runs)", type=Path, - dest="assets_folder", + dest="dl_folder", ) parser.add_argument("--debug", help="Enable verbose output", action="store_true") @@ -225,8 +225,8 @@ def prepare_context(raw_args: list[str], tmpdir: str) -> None: else: args_dict["tmp_folder"] = Path(tmpdir) - if not args_dict.get("assets_folder", None): - args_dict["assets_folder"] = args_dict["tmp_folder"] / "assets" + if not args_dict.get("dl_folder", None): + args_dict["dl_folder"] = args_dict["tmp_folder"] / "dl" args_dict["_current_thread_workitem"] = threading.local() args_dict["web_session"] = get_session() diff --git a/scraper/src/maps2zim/processor.py b/scraper/src/maps2zim/processor.py index 518662a..cc5640f 100644 --- a/scraper/src/maps2zim/processor.py +++ b/scraper/src/maps2zim/processor.py @@ -276,6 +276,9 @@ def run_with_creator(self, creator: Creator): logger.info(" Generating about page...") self._write_about_html(creator) + context.current_thread_workitem = "write assets" + self._write_assets(creator) + context.current_thread_workitem = "download fonts" self._fetch_fonts_tar_gz() @@ -427,10 +430,10 @@ def _fetch_favicon_from_illustration(self, illustration: BytesIO) -> BytesIO: def _fetch_fonts_tar_gz(self): """Download fonts tar.gz from OpenFreeMap if not already cached. - If file already exists in assets folder, do nothing. + If file already exists in dl folder, do nothing. Otherwise, download from https://assets.openfreemap.com/fonts/ofm.tar.gz """ - fonts_tar_gz_path = context.assets_folder / "ofm.tar.gz" + fonts_tar_gz_path = context.dl_folder / "ofm.tar.gz" # If file already exists, we're done if fonts_tar_gz_path.exists(): @@ -439,8 +442,8 @@ def _fetch_fonts_tar_gz(self): ) return - # Create assets folder if it doesn't exist - context.assets_folder.mkdir(parents=True, exist_ok=True) + # Create dl folder if it doesn't exist + context.dl_folder.mkdir(parents=True, exist_ok=True) logger.info(" Downloading fonts from OpenFreeMap") save_large_file( @@ -455,7 +458,7 @@ def _write_fonts(self, creator: Creator): Extracts the cached fonts tar.gz file and adds all contents to the ZIM with paths under the 'fonts/' subfolder. """ - fonts_tar_gz_path = context.assets_folder / "ofm.tar.gz" + fonts_tar_gz_path = context.dl_folder / "ofm.tar.gz" logger.info(" Extracting fonts and adding to ZIM") @@ -485,10 +488,10 @@ def _write_fonts(self, creator: Creator): def _fetch_natural_earth_tar_gz(self): """Download natural_earth tar.gz from OpenFreeMap if not already cached. - If file already exists in assets folder, do nothing. + If file already exists in dl folder, do nothing. Otherwise, download from http://assets.openfreemap.com/natural_earth/ofm.tar.gz """ - natural_earth_tar_gz_path = context.assets_folder / "natural_earth.tar.gz" + natural_earth_tar_gz_path = context.dl_folder / "natural_earth.tar.gz" # If file already exists, we're done if natural_earth_tar_gz_path.exists(): @@ -498,8 +501,8 @@ def _fetch_natural_earth_tar_gz(self): ) return - # Create assets folder if it doesn't exist - context.assets_folder.mkdir(parents=True, exist_ok=True) + # Create dl folder if it doesn't exist + context.dl_folder.mkdir(parents=True, exist_ok=True) logger.info(" Downloading natural_earth from OpenFreeMap") save_large_file( @@ -515,7 +518,7 @@ def _write_natural_earth(self, creator: Creator): to the ZIM, transforming paths from ofm/ne2sr/ to natural_earth/ne2sr/. Raises an error if no webp files are found. """ - natural_earth_tar_gz_path = context.assets_folder / "natural_earth.tar.gz" + natural_earth_tar_gz_path = context.dl_folder / "natural_earth.tar.gz" logger.info(" Extracting natural_earth and adding to ZIM") @@ -578,10 +581,10 @@ def _fetch_geonames_zip(self): Downloads from https://download.geonames.org/export/dump/{region}.zip, extracts the TSV file, and removes the ZIP file. - The extracted TSV is cached in the assets folder for processing. + The extracted TSV is cached in the dl folder for processing. """ - geonames_zip_path = context.assets_folder / f"{context.geonames_region}.zip" - geonames_txt_path = context.assets_folder / f"{context.geonames_region}.txt" + geonames_zip_path = context.dl_folder / f"{context.geonames_region}.zip" + geonames_txt_path = context.dl_folder / f"{context.geonames_region}.txt" # If extracted TSV file already exists, we're done if geonames_txt_path.exists(): @@ -591,8 +594,8 @@ def _fetch_geonames_zip(self): ) return - # Create assets folder if it doesn't exist - context.assets_folder.mkdir(parents=True, exist_ok=True) + # Create dl folder if it doesn't exist + context.dl_folder.mkdir(parents=True, exist_ok=True) logger.info( f" Downloading geonames {context.geonames_region} from geonames.org" @@ -615,7 +618,7 @@ def _fetch_geonames_zip(self): f"Could not find {txt_file_name} in geonames ZIP at " f"{geonames_zip_path}" ) - zip_ref.extract(txt_file_name, context.assets_folder) + zip_ref.extract(txt_file_name, context.dl_folder) # Remove ZIP file to save space geonames_zip_path.unlink() @@ -626,10 +629,10 @@ def _fetch_hierarchy_zip(self): Downloads from https://download.geonames.org/export/dump/hierarchy.zip, extracts the hierarchy.txt file, and removes the ZIP file. - The extracted TSV is cached in the assets folder for processing. + The extracted TSV is cached in the dl folder for processing. """ - hierarchy_zip_path = context.assets_folder / "hierarchy.zip" - hierarchy_txt_path = context.assets_folder / "hierarchy.txt" + hierarchy_zip_path = context.dl_folder / "hierarchy.zip" + hierarchy_txt_path = context.dl_folder / "hierarchy.txt" # If extracted TSV file already exists, we're done if hierarchy_txt_path.exists(): @@ -638,8 +641,8 @@ def _fetch_hierarchy_zip(self): ) return - # Create assets folder if it doesn't exist - context.assets_folder.mkdir(parents=True, exist_ok=True) + # Create dl folder if it doesn't exist + context.dl_folder.mkdir(parents=True, exist_ok=True) logger.info(" Downloading hierarchy from geonames.org") hierarchy_url = "https://download.geonames.org/export/dump/hierarchy.zip" @@ -653,7 +656,7 @@ def _fetch_hierarchy_zip(self): raise OSError( f"Could not find hierarchy.txt in ZIP at {hierarchy_zip_path}" ) - zip_ref.extract("hierarchy.txt", context.assets_folder) + zip_ref.extract("hierarchy.txt", context.dl_folder) # Remove ZIP file to save space hierarchy_zip_path.unlink() @@ -665,9 +668,9 @@ def _fetch_country_info(self): """Download country info TSV from geonames if not already cached. Downloads from https://download.geonames.org/export/dump/countryInfo.txt - and caches it in the assets folder. + and caches it in the dl folder. """ - country_info_path = context.assets_folder / "countryInfo.txt" + country_info_path = context.dl_folder / "countryInfo.txt" # If file already exists, we're done if country_info_path.exists(): @@ -676,8 +679,8 @@ def _fetch_country_info(self): ) return - # Create assets folder if it doesn't exist - context.assets_folder.mkdir(parents=True, exist_ok=True) + # Create dl folder if it doesn't exist + context.dl_folder.mkdir(parents=True, exist_ok=True) logger.info(" Downloading country info from geonames.org") save_large_file( @@ -696,7 +699,7 @@ def _parse_country_info() -> dict[str, str]: Returns: Dictionary mapping ISO code to country name. """ - country_info_path = context.assets_folder / "countryInfo.txt" + country_info_path = context.dl_folder / "countryInfo.txt" if not country_info_path.exists(): logger.info(" Country info not available, skipping country name lookup") @@ -733,10 +736,10 @@ def _parse_country_info() -> dict[str, str]: def _fetch_sprites_tar_gz(self): """Download sprites tar.gz from OpenFreeMap if not already cached. - If file already exists in assets folder, do nothing. + If file already exists in dl folder, do nothing. Otherwise, download from https://assets.openfreemap.com/sprites/ofm_f384.tar.gz """ - sprites_tar_gz_path = context.assets_folder / "sprites.tar.gz" + sprites_tar_gz_path = context.dl_folder / "sprites.tar.gz" # If file already exists, we're done if sprites_tar_gz_path.exists(): @@ -745,8 +748,8 @@ def _fetch_sprites_tar_gz(self): ) return - # Create assets folder if it doesn't exist - context.assets_folder.mkdir(parents=True, exist_ok=True) + # Create dl folder if it doesn't exist + context.dl_folder.mkdir(parents=True, exist_ok=True) logger.info(" Downloading sprites from OpenFreeMap") save_large_file( @@ -761,7 +764,7 @@ def _write_sprites(self, creator: Creator): Extracts the cached sprites tar.gz file and adds all contents to the ZIM, transforming paths from ofm_f384/ to sprites/ofm_f384/. """ - sprites_tar_gz_path = context.assets_folder / "sprites.tar.gz" + sprites_tar_gz_path = context.dl_folder / "sprites.tar.gz" logger.info(" Extracting sprites and adding to ZIM") @@ -794,7 +797,7 @@ def _write_styles(self, creator: Creator): logger.info(" Cleaning styles and adding to ZIM") # Extract and add styles to ZIM - for style in Path(str(assets)).glob("kiwix-*.json"): + for style in assets.glob("kiwix-*.json"): # Parse JSON style_obj = json.loads(style.read_bytes()) @@ -816,8 +819,9 @@ def _write_styles(self, creator: Creator): # Add to ZIM creator.add_item_for( - path=f"styles/{str(style.relative_to(assets))[:-5]}", + path=f"assets/{str(style.relative_to(assets))[:-5]}", content=content, + mimetype="application/json", ) logger.info(" Styles added to ZIM") @@ -828,7 +832,7 @@ def _get_available_layers_from_mbtiles(self) -> set[str]: Reads the mbtiles database and extracts the list of available layers from the vector_layers metadata. """ - mbtiles_path = context.assets_folder / f"{context.area}.mbtiles" + mbtiles_path = context.dl_folder / f"{context.area}.mbtiles" # If mbtiles doesn't exist yet, return empty set if not mbtiles_path.exists(): @@ -884,7 +888,7 @@ def _count_mbtiles_items(self) -> tuple[int, int]: Returns: Tuple of (dedupl_count, tile_count) """ - mbtiles_path = context.assets_folder / f"{context.area}.mbtiles" + mbtiles_path = context.dl_folder / f"{context.area}.mbtiles" conn = sqlite3.connect(mbtiles_path) c = conn.cursor() @@ -916,7 +920,7 @@ def _write_tiles_to_zim( """ logger.info(" Processing tiles and dedup files") - mbtiles_path = context.assets_folder / f"{context.area}.mbtiles" + mbtiles_path = context.dl_folder / f"{context.area}.mbtiles" conn = sqlite3.connect(mbtiles_path) c = conn.cursor() @@ -1010,7 +1014,7 @@ def _get_mbtiles_maxzoom(self) -> int: Returns: Maximum zoom level (default 14 if not found) """ - mbtiles_path = context.assets_folder / f"{context.area}.mbtiles" + mbtiles_path = context.dl_folder / f"{context.area}.mbtiles" if not mbtiles_path.exists(): return 14 # Default if file doesn't exist yet @@ -1025,7 +1029,7 @@ def _get_mbtiles_maxzoom(self) -> int: conn.close() def _fetch_mbtiles(self): - """Ensure mbtiles file is available in assets folder + """Ensure mbtiles file is available in dl folder If file is already there, do nothing. @@ -1036,7 +1040,7 @@ def _fetch_mbtiles(self): # Determine the mbtiles filename based on area mbtiles_filename = f"{context.area}.mbtiles" - mbtiles_path = context.assets_folder / mbtiles_filename + mbtiles_path = context.dl_folder / mbtiles_filename # If file already exists, we're done if mbtiles_path.exists(): @@ -1044,7 +1048,7 @@ def _fetch_mbtiles(self): return # Create assets folder if it doesn't exist - context.assets_folder.mkdir(parents=True, exist_ok=True) + context.dl_folder.mkdir(parents=True, exist_ok=True) logger.info(f" Fetching mbtiles file for area: {context.area}") @@ -1120,7 +1124,7 @@ def _write_tilejson(self, creator: Creator): Reads metadata from the mbtiles database and generates a TileJSON file that describes the tileset for use by the web UI. """ - mbtiles_path = context.assets_folder / f"{context.area}.mbtiles" + mbtiles_path = context.dl_folder / f"{context.area}.mbtiles" conn = sqlite3.connect(mbtiles_path) c = conn.cursor() @@ -1211,7 +1215,7 @@ def _parse_geonames( Dictionary mapping place names to lists of SearchPlace objects. Returns empty dict if data file is not available. """ - geonames_txt_path = context.assets_folder / f"{context.geonames_region}.txt" + geonames_txt_path = context.dl_folder / f"{context.geonames_region}.txt" if not geonames_txt_path.exists(): logger.info(" Geonames data not available, skipping") @@ -1307,7 +1311,7 @@ def _parse_hierarchy() -> dict[str, str]: Returns: Dictionary mapping child_id to parent_id. """ - hierarchy_txt_path = context.assets_folder / "hierarchy.txt" + hierarchy_txt_path = context.dl_folder / "hierarchy.txt" if not hierarchy_txt_path.exists(): logger.info(" Hierarchy data not available, skipping hierarchical labels") @@ -1419,7 +1423,6 @@ def _write_places( Takes a dictionary of places grouped by name and creates: - Redirect HTML for unique place names - Disambiguation HTML for duplicate names - - CSS file for styling (styles.css) Args: creator: ZIM creator object @@ -1429,14 +1432,6 @@ def _write_places( logger.info(" No places to write, skipping") return - # Add CSS file to ZIM - styles_path = assets / "styles.css" - creator.add_item_for( - path="content/styles.css", - fpath=styles_path, - mimetype="text/css", - ) - # Setup progress tracking total_places = len(places_dict) self.stats_items_total += total_places @@ -1502,9 +1497,10 @@ def _create_redirect_html(place: SearchPlace, root_prefix: str) -> str: