Skip to content

Commit

Permalink
Add --split/--join for rdf-files to transform subcmd
Browse files Browse the repository at this point in the history
  • Loading branch information
dalito committed Aug 6, 2023
1 parent 09c3c63 commit 0a902d4
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 46 deletions.
28 changes: 24 additions & 4 deletions src/voc4cat/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,29 @@ def add_transform_subparser(subparsers, options):
),
**options,
)
skosopt = parser.add_argument_group("SKOS options")
skosopt_meg = skosopt.add_mutually_exclusive_group()
skosopt_meg.add_argument(
"--split",
help=(
"Convert a single SKOS turtle file to a set of turtle files with "
"one class per file. The diff-optimized long turtle format is used. "
"Combine with --inplace to remove the source file."
),
action="store_true",
)
skosopt_meg.add_argument(
"--join",
help=(
"Join a directory of turtles files representing a split SKOS "
"vocabulary to a single turtle file. Combine with --inplace "
"to remove the source directory and files."
),
action="store_true",
)
xlsxopt = parser.add_argument_group("Excel/xlsx options")
group = xlsxopt.add_mutually_exclusive_group()
group.add_argument(
xlsxopt_meg = xlsxopt.add_mutually_exclusive_group()
xlsxopt_meg.add_argument(
"--make-ids",
help=(
"Acts on xlsx files. Specify the prefix or mapping as prefix:base-IRI and first ID to use. "
Expand All @@ -195,12 +215,12 @@ def add_transform_subparser(subparsers, options):
metavar=("PREFIX-MAPPING", "START-ID"),
type=str,
)
group.add_argument(
xlsxopt_meg.add_argument(
"--from-indent",
help=("Convert concept sheet with indentation to children-URI hierarchy."),
action="store_true",
)
group.add_argument(
xlsxopt_meg.add_argument(
"--to-indent",
help=("Convert concept sheet from children-URI hierarchy to indentation."),
action="store_true",
Expand Down
156 changes: 130 additions & 26 deletions src/voc4cat/transform.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import logging
import shutil
from pathlib import Path
from urllib.parse import urlsplit

from rdflib import Graph
from rdflib.namespace import SKOS

from voc4cat.checks import Voc4catError
from voc4cat.utils import EXCEL_FILE_ENDINGS, RDF_FILE_ENDINGS
Expand All @@ -8,6 +13,58 @@
logger = logging.getLogger(__name__)


def extract_numeric_id_from_iri(iri):
    """Return the trailing numeric id from the path of *iri* as a string.

    Examples: ".../voc/0000123" and ".../voc/0000123/" both yield "0000123".
    Returns "" if the path does not end in digits.
    """
    iri_path = urlsplit(iri).path
    reverse_id = []
    for char in reversed(iri_path):
        if char.isdigit():
            reverse_id.append(char)
        elif char == "/" and not reverse_id:
            # Tolerate a trailing slash before the id.
            continue
        else:
            # Stop at the first non-digit (or at the "/" that ends the id
            # segment); previously a "/" was always skipped, which wrongly
            # concatenated digits across path segments ("/12/34" -> "1234").
            break
    return "".join(reversed(reverse_id))


def write_split_turtle(vocab_graph: Graph, outdir: Path) -> None:
    """
    Write each concept, collection and concept scheme to a separate turtle file.

    The numeric id extracted from each IRI is used as the filename; the
    concept scheme is always written to "concept_scheme.ttl". Files are
    serialized in the diff-friendly "longturtle" format.
    """
    outdir.mkdir(exist_ok=True)
    query_template = "SELECT ?iri WHERE {?iri a %s.}"

    for class_name in ("skos:Concept", "skos:Collection", "skos:ConceptScheme"):
        results = vocab_graph.query(query_template % class_name, initNs={"skos": SKOS})
        # One output file per matched instance.
        for row in results:
            iri = row["iri"]
            instance_graph = Graph()
            instance_graph += vocab_graph.triples((iri, None, None))
            if class_name == "skos:ConceptScheme":
                filename = "concept_scheme.ttl"
            else:
                filename = f"{extract_numeric_id_from_iri(iri)}.ttl"
            instance_graph.serialize(destination=outdir / filename, format="longturtle")
        logger.debug("-> wrote %i %ss-file(s).", len(results), class_name)


def read_split_turtle(vocab_dir: Path) -> Graph:
    """Merge all turtle files found below *vocab_dir* into one graph.

    Returns the merged graph.

    NOTE(review): besides returning the graph, this also serializes it to
    "<vocab_dir>.ttl" next to the source directory. The caller in transform()
    serializes the graph again (possibly to a different --outdir location),
    so this write looks redundant — confirm whether the side effect is wanted.
    """
    # Search recursively all turtle files belonging to the concept scheme
    turtle_files = vocab_dir.rglob("*.ttl")
    # Create an empty RDF graph to hold the concept scheme
    cs_graph = Graph()
    # Load each turtle file into a separate graph and merge it into the concept scheme graph
    for file in turtle_files:
        graph = Graph().parse(file, format="turtle")
        cs_graph += graph
    cs_graph.serialize(destination=vocab_dir.with_suffix(".ttl"), format="turtle")
    return cs_graph


def _check_make_ids_args(args):
"""Validate make_ids arguments"""
try:
Expand Down Expand Up @@ -43,45 +100,92 @@ def _check_indent(args):
return separator


def transform(args):
logger.info("Transform subcommand started!")

def _transform_xlsx(file, args):
    """Apply the requested xlsx transformation (indent conversion or id
    generation) to *file*, writing to --outdir or in place."""
    # Validate option combinations first; these helpers raise on bad input.
    if args.from_indent or args.to_indent:
        sep = _check_indent(args)
    if args.make_ids:
        prefix, base_iri, start_id = _check_make_ids_args(args)

    logger.debug('Processing "%s"', file)

    target = args.outdir / file.name if args.outdir is not None else file
    action_requested = any((args.from_indent, args.to_indent, args.make_ids))
    # Refuse to overwrite the input unless --inplace was given explicitly.
    if action_requested and target == file and not args.inplace:
        logger.warning(
            'This command will overwrite the existing file "%s".'
            'Use the flag "--inplace" to enforce replacement or '
            'supply an output directory with flag "--outdir".',
            file,
        )
        return
    if args.from_indent:
        hierarchy_from_indent(file, target, sep)
    elif args.to_indent:
        hierarchy_to_indent(file, target, sep)
    elif args.make_ids:
        make_ids(file, target, prefix, start_id, base_iri)
    else:
        logger.debug("-> nothing to do for xlsx files!")


def _transform_rdf(file, args):
    """Apply the requested rdf transformation (--split) to *file*."""
    if not args.split:
        logger.debug("-> nothing to do for rdf files!")
        return
    vocab_graph = Graph().parse(str(file), format=RDF_FILE_ENDINGS[file.suffix])
    # The split files go into a directory named after the vocabulary file.
    if args.outdir:
        split_dir = args.outdir / file.with_suffix("").name
    else:
        split_dir = file.with_suffix("")
    split_dir.mkdir(exist_ok=True)
    write_split_turtle(vocab_graph, split_dir)
    logger.debug("-> wrote split vocabulary to: %s", split_dir)
    if args.inplace:
        # --inplace means the single-file source is no longer needed.
        logger.debug("-> going to remove %s", file)
        file.unlink()


def transform(args):
    """Entry point of the transform subcommand.

    Dispatches each xlsx file, rdf file and (for --join) each directory of
    split turtle files below args.VOCAB to the matching handler.
    """
    logger.info("Transform subcommand started!")

    files = [args.VOCAB] if args.VOCAB.is_file() else [*Path(args.VOCAB).iterdir()]
    xlsx_files = [f for f in files if f.suffix.lower() in EXCEL_FILE_ENDINGS]
    rdf_files = [f for f in files if f.suffix.lower() in RDF_FILE_ENDINGS]

    if args.VOCAB.is_file() and (len(xlsx_files) + len(rdf_files)) == 0:
        logger.warning("Unsupported filetype: %s", args.VOCAB)

    # --join operates on sub-directories containing split turtle files.
    # Guard with is_dir(): iterdir() on a file would raise NotADirectoryError.
    if args.join and args.VOCAB.is_dir():
        rdf_dirs = [d for d in Path(args.VOCAB).iterdir() if any(d.glob("*.ttl"))]
    else:
        rdf_dirs = []

    for file in xlsx_files:
        _transform_xlsx(file, args)

    for file in rdf_files:
        logger.debug('Processing "%s"', file)
        _transform_rdf(file, args)

    for rdf_dir in rdf_dirs:
        logger.debug('Processing rdf files in "%s"', rdf_dir)
        # The if..else is not required now. It is a frame for future additions.
        if args.join:
            vocab_graph = read_split_turtle(rdf_dir)
            # Place the joined file in --outdir if given. Previously the
            # expression reduced to the bare filename ("<dir>.ttl"), which
            # silently wrote into the current working directory instead.
            dest = (
                args.outdir / rdf_dir.with_suffix(".ttl").name
                if args.outdir
                else rdf_dir.with_suffix(".ttl")
            )
            vocab_graph.serialize(destination=str(dest), format="turtle")
            logger.debug("-> joined vocabulary into: %s", dest)
            if args.inplace:
                # --inplace means the split source directory is removed.
                logger.debug("-> going to remove %s", rdf_dir)
                shutil.rmtree(rdf_dir, ignore_errors=True)
        else:  # pragma: no cover
            logger.debug("-> nothing to do!")
2 changes: 1 addition & 1 deletion src/voc4cat/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
".n3": "n3",
}
KNOWN_FILE_ENDINGS = [str(x) for x in RDF_FILE_ENDINGS] + EXCEL_FILE_ENDINGS
KNOWN_TEMPLATE_VERSIONS = ["0.4.0", "0.4.1", "0.4.2", "0.4.3"]
KNOWN_TEMPLATE_VERSIONS = ["0.4.3"]
LATEST_TEMPLATE = KNOWN_TEMPLATE_VERSIONS[-1]


Expand Down
19 changes: 8 additions & 11 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ def test_nonexisting_file(monkeypatch, datadir, caplog):
assert "File/dir not found: missing.xyz" in caplog.text


def test_exit_errorvalue(monkeypatch, datadir, caplog):
    """A Voc4cat error terminates the CLI app with exit code 1."""
    monkeypatch.chdir(datadir)
    with caplog.at_level(logging.ERROR):
        with pytest.raises(SystemExit) as exc_info:
            run_cli_app(["transform", "missing.xyz"])
    assert exc_info.value.code == 1
    assert "Terminating with Voc4cat error." in caplog.text


def test_nonexisting_config(monkeypatch, datadir, caplog):
monkeypatch.chdir(datadir)
with caplog.at_level(logging.ERROR), pytest.raises(Voc4catError):
Expand All @@ -84,14 +92,3 @@ def test_invalid_outdir(monkeypatch, datadir, tmp_path, caplog):
):
main_cli(["transform", "--outdir", str(tmp_path / "README.md"), CS_SIMPLE])
assert "Outdir must be a directory but it is a file." in caplog.text


# TODO test logs/loglevel
# @mock.patch.dict(os.environ, {"LOGLEVEL": "DEBUG"})
# def test_set_log_level_by_envvar(datadir, tmp_path):
# dst = tmp_path
# shutil.copy(datadir / CS_SIMPLE, dst)
# outdir = tmp_path / "out"

# main_cli(["check", "--ci-pre", "--outdir", str(outdir), str(dst)])
# assert ret_code is None
57 changes: 57 additions & 0 deletions tests/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,9 @@ def test_hierarchy_to_indent_merge(monkeypatch, datadir, tmp_path):
)


# ===== Tests for option --outdir =====


@pytest.mark.parametrize(
"outdir",
[None, "out"],
Expand Down Expand Up @@ -525,3 +528,57 @@ def test_outdir_variants(monkeypatch, datadir, tmp_path, outdir):
ws.iter_rows(min_row=3, max_col=2, values_only=True), expected
):
assert row == expected_row


# ===== Tests for options --split / --join =====


@pytest.mark.parametrize(
    "opt",
    [None, "--inplace"],
    ids=["default", "inplace"],
)
def test_split(monkeypatch, datadir, tmp_path, opt, caplog):
    """--split writes one turtle file per SKOS class instance."""
    shutil.copy(datadir / CS_SIMPLE_TURTLE, tmp_path)
    cmd = ["transform", "-v", "--split"]
    if opt:
        # Use the parametrized value; it was hard-coded as "--inplace" before.
        cmd.append(opt)
    cmd.append(str(tmp_path))
    monkeypatch.chdir(tmp_path)

    with caplog.at_level(logging.DEBUG):
        main_cli(cmd)
    assert "-> wrote split vocabulary to" in caplog.text

    vocdir = (tmp_path / CS_SIMPLE_TURTLE).with_suffix("")
    assert vocdir.is_dir()
    assert (vocdir / "concept_scheme.ttl").exists()
    assert len([*vocdir.glob("*.ttl")]) == 8  # noqa: PLR2004
    if opt:
        # --inplace removes the original single-file vocabulary.
        assert not (tmp_path / CS_SIMPLE_TURTLE).exists()


@pytest.mark.parametrize(
    "opt",
    [None, "--inplace"],
    ids=["default", "inplace"],
)
def test_join(monkeypatch, datadir, tmp_path, opt, caplog):
    """--join merges a directory of split turtle files back into one file."""
    monkeypatch.chdir(tmp_path)
    shutil.copy(datadir / CS_SIMPLE_TURTLE, tmp_path)
    # create dir with split files
    main_cli(["transform", "-v", "--split", "--inplace", str(tmp_path)])
    # join files again as test
    cmd = ["transform", "-v", "--join"]
    if opt:
        # Use the parametrized value; it was hard-coded as "--inplace" before.
        cmd.append(opt)
    cmd.append(str(tmp_path))

    with caplog.at_level(logging.DEBUG):
        main_cli(cmd)
    assert "-> joined vocabulary into" in caplog.text

    vocdir = (tmp_path / CS_SIMPLE_TURTLE).with_suffix("")
    assert (vocdir.parent / CS_SIMPLE_TURTLE).exists()
    if opt:
        # --inplace removes the source directory of split files.
        assert not vocdir.exists()
4 changes: 0 additions & 4 deletions tests/test_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,6 @@ def test_check(datadir, tmp_path, caplog, test_file, err, msg): # noqa: PLR0913
exit_code = main_cli(["--check", "--no-warn", str(dst)])
assert exit_code == err
assert msg in caplog.text
# TODO check that erroneous cells get colored.


def test_check_overwrite_warning(monkeypatch, datadir, tmp_path):
Expand Down Expand Up @@ -627,9 +626,6 @@ def test_no_separator(monkeypatch, datadir):
main_cli(["--indent-separator", "", CS_CYCLES])


# TODO From here on review rest of tests for new cli.


def test_duplicates(datadir, tmp_path, caplog):
"""Check that files do not have the same stem."""
shutil.copy(datadir / CS_CYCLES, tmp_path)
Expand Down

0 comments on commit 0a902d4

Please sign in to comment.