Skip to content

Commit

Permalink
Add --split/--join for rdf-files to transform subcmd
Browse files Browse the repository at this point in the history
  • Loading branch information
dalito committed Aug 6, 2023
1 parent 09c3c63 commit 0a902d4
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 46 deletions.
28 changes: 24 additions & 4 deletions src/voc4cat/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,29 @@ def add_transform_subparser(subparsers, options):
),
**options,
)
skosopt = parser.add_argument_group("SKOS options")
skosopt_meg = skosopt.add_mutually_exclusive_group()
skosopt_meg.add_argument(
"--split",
help=(
"Convert a single SKOS turtle file to a set of turtle files with "
"one class per file. The diff-optimized long turtle format is used. "
"Combine with --inplace to remove the source file."
),
action="store_true",
)
skosopt_meg.add_argument(
"--join",
help=(
"Join a directory of turtles files representing a split SKOS "
"vocabulary to a single turtle file. Combine with --inplace "
"to remove the source directory and files."
),
action="store_true",
)
xlsxopt = parser.add_argument_group("Excel/xlsx options")
group = xlsxopt.add_mutually_exclusive_group()
group.add_argument(
xlsxopt_meg = xlsxopt.add_mutually_exclusive_group()
xlsxopt_meg.add_argument(
"--make-ids",
help=(
"Acts on xlsx files. Specify the prefix or mapping as prefix:base-IRI and first ID to use. "
Expand All @@ -195,12 +215,12 @@ def add_transform_subparser(subparsers, options):
metavar=("PREFIX-MAPPING", "START-ID"),
type=str,
)
group.add_argument(
xlsxopt_meg.add_argument(
"--from-indent",
help=("Convert concept sheet with indentation to children-URI hierarchy."),
action="store_true",
)
group.add_argument(
xlsxopt_meg.add_argument(
"--to-indent",
help=("Convert concept sheet from children-URI hierarchy to indentation."),
action="store_true",
Expand Down
156 changes: 130 additions & 26 deletions src/voc4cat/transform.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import logging
import shutil
from pathlib import Path
from urllib.parse import urlsplit

from rdflib import Graph
from rdflib.namespace import SKOS

from voc4cat.checks import Voc4catError
from voc4cat.utils import EXCEL_FILE_ENDINGS, RDF_FILE_ENDINGS
Expand All @@ -8,6 +13,58 @@
logger = logging.getLogger(__name__)


def extract_numeric_id_from_iri(iri):
    """Return the trailing numeric id from the path of *iri* as a string.

    Examples: ".../voc/0000123" and ".../voc/0000123/" both yield "0000123".
    Returns "" if the path does not end in digits.
    """
    iri_path = urlsplit(iri).path
    reverse_id = []
    for char in reversed(iri_path):
        if char.isdigit():
            reverse_id.append(char)
        elif char == "/" and not reverse_id:
            # Tolerate a trailing slash before the id.
            continue
        else:
            # Stop at the first non-digit (or at the "/" that ends the id
            # segment); previously a "/" was always skipped, which wrongly
            # concatenated digits across path segments ("/12/34" -> "1234").
            break
    return "".join(reversed(reverse_id))


def write_split_turtle(vocab_graph: Graph, outdir: Path) -> None:
    """
    Write each concept, collection and concept scheme to a separate turtle file.

    The numeric id extracted from each IRI is used as the filename; the
    concept scheme is always written to "concept_scheme.ttl". Files are
    serialized in the diff-friendly "longturtle" format.
    """
    outdir.mkdir(exist_ok=True)
    query_template = "SELECT ?iri WHERE {?iri a %s.}"

    for class_name in ("skos:Concept", "skos:Collection", "skos:ConceptScheme"):
        results = vocab_graph.query(query_template % class_name, initNs={"skos": SKOS})
        # One output file per matched instance.
        for row in results:
            iri = row["iri"]
            instance_graph = Graph()
            instance_graph += vocab_graph.triples((iri, None, None))
            if class_name == "skos:ConceptScheme":
                filename = "concept_scheme.ttl"
            else:
                filename = f"{extract_numeric_id_from_iri(iri)}.ttl"
            instance_graph.serialize(destination=outdir / filename, format="longturtle")
        logger.debug("-> wrote %i %ss-file(s).", len(results), class_name)


def read_split_turtle(vocab_dir: Path) -> Graph:
    """Merge all turtle files found below *vocab_dir* into one graph.

    Returns the merged graph.

    NOTE(review): besides returning the graph, this also serializes it to
    "<vocab_dir>.ttl" next to the source directory. The caller in transform()
    serializes the graph again (possibly to a different --outdir location),
    so this write looks redundant — confirm whether the side effect is wanted.
    """
    # Search recursively all turtle files belonging to the concept scheme
    turtle_files = vocab_dir.rglob("*.ttl")
    # Create an empty RDF graph to hold the concept scheme
    cs_graph = Graph()
    # Load each turtle file into a separate graph and merge it into the concept scheme graph
    for file in turtle_files:
        graph = Graph().parse(file, format="turtle")
        cs_graph += graph
    cs_graph.serialize(destination=vocab_dir.with_suffix(".ttl"), format="turtle")
    return cs_graph


def _check_make_ids_args(args):
"""Validate make_ids arguments"""
try:
Expand Down Expand Up @@ -43,45 +100,92 @@ def _check_indent(args):
return separator


def transform(args):
logger.info("Transform subcommand started!")

def _transform_xlsx(file, args):
    """Apply the requested xlsx transformation (indent conversion or id
    generation) to *file*, writing to --outdir or in place."""
    # Validate option combinations first; these helpers raise on bad input.
    if args.from_indent or args.to_indent:
        sep = _check_indent(args)
    if args.make_ids:
        prefix, base_iri, start_id = _check_make_ids_args(args)

    logger.debug('Processing "%s"', file)

    target = args.outdir / file.name if args.outdir is not None else file
    action_requested = any((args.from_indent, args.to_indent, args.make_ids))
    # Refuse to overwrite the input unless --inplace was given explicitly.
    if action_requested and target == file and not args.inplace:
        logger.warning(
            'This command will overwrite the existing file "%s".'
            'Use the flag "--inplace" to enforce replacement or '
            'supply an output directory with flag "--outdir".',
            file,
        )
        return
    if args.from_indent:
        hierarchy_from_indent(file, target, sep)
    elif args.to_indent:
        hierarchy_to_indent(file, target, sep)
    elif args.make_ids:
        make_ids(file, target, prefix, start_id, base_iri)
    else:
        logger.debug("-> nothing to do for xlsx files!")


def _transform_rdf(file, args):
    """Apply the requested rdf transformation (--split) to *file*."""
    if not args.split:
        logger.debug("-> nothing to do for rdf files!")
        return
    vocab_graph = Graph().parse(str(file), format=RDF_FILE_ENDINGS[file.suffix])
    # The split files go into a directory named after the vocabulary file.
    if args.outdir:
        split_dir = args.outdir / file.with_suffix("").name
    else:
        split_dir = file.with_suffix("")
    split_dir.mkdir(exist_ok=True)
    write_split_turtle(vocab_graph, split_dir)
    logger.debug("-> wrote split vocabulary to: %s", split_dir)
    if args.inplace:
        # --inplace means the single-file source is no longer needed.
        logger.debug("-> going to remove %s", file)
        file.unlink()


def transform(args):
    """Entry point of the transform subcommand.

    Dispatches each xlsx file, rdf file and (for --join) each directory of
    split turtle files below args.VOCAB to the matching handler.
    """
    logger.info("Transform subcommand started!")

    files = [args.VOCAB] if args.VOCAB.is_file() else [*Path(args.VOCAB).iterdir()]
    xlsx_files = [f for f in files if f.suffix.lower() in EXCEL_FILE_ENDINGS]
    rdf_files = [f for f in files if f.suffix.lower() in RDF_FILE_ENDINGS]

    if args.VOCAB.is_file() and (len(xlsx_files) + len(rdf_files)) == 0:
        logger.warning("Unsupported filetype: %s", args.VOCAB)

    # --join operates on sub-directories containing split turtle files.
    # Guard with is_dir(): iterdir() on a file would raise NotADirectoryError.
    if args.join and args.VOCAB.is_dir():
        rdf_dirs = [d for d in Path(args.VOCAB).iterdir() if any(d.glob("*.ttl"))]
    else:
        rdf_dirs = []

    for file in xlsx_files:
        _transform_xlsx(file, args)

    for file in rdf_files:
        logger.debug('Processing "%s"', file)
        _transform_rdf(file, args)

    for rdf_dir in rdf_dirs:
        logger.debug('Processing rdf files in "%s"', rdf_dir)
        # The if..else is not required now. It is a frame for future additions.
        if args.join:
            vocab_graph = read_split_turtle(rdf_dir)
            # Place the joined file in --outdir if given. Previously the
            # expression reduced to the bare filename ("<dir>.ttl"), which
            # silently wrote into the current working directory instead.
            dest = (
                args.outdir / rdf_dir.with_suffix(".ttl").name
                if args.outdir
                else rdf_dir.with_suffix(".ttl")
            )
            vocab_graph.serialize(destination=str(dest), format="turtle")
            logger.debug("-> joined vocabulary into: %s", dest)
            if args.inplace:
                # --inplace means the split source directory is removed.
                logger.debug("-> going to remove %s", rdf_dir)
                shutil.rmtree(rdf_dir, ignore_errors=True)
        else:  # pragma: no cover
            logger.debug("-> nothing to do!")
2 changes: 1 addition & 1 deletion src/voc4cat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
".n3": "n3",
}
KNOWN_FILE_ENDINGS = [str(x) for x in RDF_FILE_ENDINGS] + EXCEL_FILE_ENDINGS
KNOWN_TEMPLATE_VERSIONS = ["0.4.0", "0.4.1", "0.4.2", "0.4.3"]
KNOWN_TEMPLATE_VERSIONS = ["0.4.3"]
LATEST_TEMPLATE = KNOWN_TEMPLATE_VERSIONS[-1]


Expand Down
19 changes: 8 additions & 11 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ def test_nonexisting_file(monkeypatch, datadir, caplog):
assert "File/dir not found: missing.xyz" in caplog.text


def test_exit_errorvalue(monkeypatch, datadir, caplog):
    """A Voc4cat error terminates the CLI app with exit code 1."""
    monkeypatch.chdir(datadir)
    with caplog.at_level(logging.ERROR):
        with pytest.raises(SystemExit) as exc_info:
            run_cli_app(["transform", "missing.xyz"])
    assert exc_info.value.code == 1
    assert "Terminating with Voc4cat error." in caplog.text


def test_nonexisting_config(monkeypatch, datadir, caplog):
monkeypatch.chdir(datadir)
with caplog.at_level(logging.ERROR), pytest.raises(Voc4catError):
Expand All @@ -84,14 +92,3 @@ def test_invalid_outdir(monkeypatch, datadir, tmp_path, caplog):
):
main_cli(["transform", "--outdir", str(tmp_path / "README.md"), CS_SIMPLE])
assert "Outdir must be a directory but it is a file." in caplog.text


# TODO test logs/loglevel
# @mock.patch.dict(os.environ, {"LOGLEVEL": "DEBUG"})
# def test_set_log_level_by_envvar(datadir, tmp_path):
# dst = tmp_path
# shutil.copy(datadir / CS_SIMPLE, dst)
# outdir = tmp_path / "out"

# main_cli(["check", "--ci-pre", "--outdir", str(outdir), str(dst)])
# assert ret_code is None
57 changes: 57 additions & 0 deletions tests/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,9 @@ def test_hierarchy_to_indent_merge(monkeypatch, datadir, tmp_path):
)


# ===== Tests for option --outdir =====


@pytest.mark.parametrize(
"outdir",
[None, "out"],
Expand Down Expand Up @@ -525,3 +528,57 @@ def test_outdir_variants(monkeypatch, datadir, tmp_path, outdir):
ws.iter_rows(min_row=3, max_col=2, values_only=True), expected
):
assert row == expected_row


# ===== Tests for options --split / --join =====


@pytest.mark.parametrize(
    "opt",
    [None, "--inplace"],
    ids=["default", "inplace"],
)
def test_split(monkeypatch, datadir, tmp_path, opt, caplog):
    """--split writes one turtle file per SKOS class instance."""
    shutil.copy(datadir / CS_SIMPLE_TURTLE, tmp_path)
    cmd = ["transform", "-v", "--split"]
    if opt:
        # Use the parametrized value; it was hard-coded as "--inplace" before.
        cmd.append(opt)
    cmd.append(str(tmp_path))
    monkeypatch.chdir(tmp_path)

    with caplog.at_level(logging.DEBUG):
        main_cli(cmd)
    assert "-> wrote split vocabulary to" in caplog.text

    vocdir = (tmp_path / CS_SIMPLE_TURTLE).with_suffix("")
    assert vocdir.is_dir()
    assert (vocdir / "concept_scheme.ttl").exists()
    assert len([*vocdir.glob("*.ttl")]) == 8  # noqa: PLR2004
    if opt:
        # --inplace removes the original single-file vocabulary.
        assert not (tmp_path / CS_SIMPLE_TURTLE).exists()


@pytest.mark.parametrize(
    "opt",
    [None, "--inplace"],
    ids=["default", "inplace"],
)
def test_join(monkeypatch, datadir, tmp_path, opt, caplog):
    """--join merges a directory of split turtle files back into one file."""
    monkeypatch.chdir(tmp_path)
    shutil.copy(datadir / CS_SIMPLE_TURTLE, tmp_path)
    # create dir with split files
    main_cli(["transform", "-v", "--split", "--inplace", str(tmp_path)])
    # join files again as test
    cmd = ["transform", "-v", "--join"]
    if opt:
        # Use the parametrized value; it was hard-coded as "--inplace" before.
        cmd.append(opt)
    cmd.append(str(tmp_path))

    with caplog.at_level(logging.DEBUG):
        main_cli(cmd)
    assert "-> joined vocabulary into" in caplog.text

    vocdir = (tmp_path / CS_SIMPLE_TURTLE).with_suffix("")
    assert (vocdir.parent / CS_SIMPLE_TURTLE).exists()
    if opt:
        # --inplace removes the source directory of split files.
        assert not vocdir.exists()
4 changes: 0 additions & 4 deletions tests/test_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,6 @@ def test_check(datadir, tmp_path, caplog, test_file, err, msg): # noqa: PLR0913
exit_code = main_cli(["--check", "--no-warn", str(dst)])
assert exit_code == err
assert msg in caplog.text
# TODO check that erroneous cells get colored.


def test_check_overwrite_warning(monkeypatch, datadir, tmp_path):
Expand Down Expand Up @@ -627,9 +626,6 @@ def test_no_separator(monkeypatch, datadir):
main_cli(["--indent-separator", "", CS_CYCLES])


# TODO From here on review rest of tests for new cli.


def test_duplicates(datadir, tmp_path, caplog):
"""Check that files do not have the same stem."""
shutil.copy(datadir / CS_CYCLES, tmp_path)
Expand Down

0 comments on commit 0a902d4

Please sign in to comment.