From 519994c3077f9e0e0d1f2f9a9723dca470346ab4 Mon Sep 17 00:00:00 2001
From: nntrn <17685332+nntrn@users.noreply.github.com>
Date: Mon, 30 Sep 2024 04:53:20 -0500
Subject: [PATCH] Condense regex

---
 scripts/annotations.jq |  2 +-
 scripts/books.jq       | 31 ++++++++++++++++---------------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/scripts/annotations.jq b/scripts/annotations.jq
index 0ae2aa2..49d65e5 100644
--- a/scripts/annotations.jq
+++ b/scripts/annotations.jq
@@ -34,7 +34,7 @@ def create_activity_data:
     created: .ZANNOTATIONCREATIONDATE,
     location: .ZANNOTATIONLOCATION,
     cfi: (.ZANNOTATIONLOCATION|[match("\\b[0-9]{1,4}\\b";"g").string|tonumber]|join(".")),
-    chapter: (if ((.ZFUTUREPROOFING5|length)>0) then .ZFUTUREPROOFING5 else (.ZANNOTATIONLOCATION|format_location) end),
+    chapter: (if ((.ZFUTUREPROOFING5|length)>0) then .ZFUTUREPROOFING5 else (.ZANNOTATIONLOCATION|format_location)? // .rangestart end),
     rangestart: .ZPLLOCATIONRANGESTART
   })
   | sort_by(.id);
diff --git a/scripts/books.jq b/scripts/books.jq
index 5a304fa..1b71a2f 100644
--- a/scripts/books.jq
+++ b/scripts/books.jq
@@ -74,32 +74,33 @@ def identify_subsection:
   elif test("[Pp][rologue]{2,}") then "Prologue"
   elif test("[gG][losary]{2,}") then "Glossary"
   elif test("[aA][fterword]{3,}") then "Afterword"
-  elif test("[aA][pendix]{3,}";"i") then "APPENDIX \($string|gsub("^[0-9]";""))"
+  elif test("[aA][pendix]{3,}";"i") then "Appendix"
   elif test("[fF][ront]{2,}") then "Front"
   elif test("^[A-Z0-9 ]+$") then ($string |gsub("[^0-9]";"")|tonumber|tostring)
   else null
   end
   ;
 
+def getnum($str): $str
+  | gsub("(?<w>[^0-9])(?<n>[0-9]+)";.w + " " + .n)
+  | [match("(\\b[0-9]+)";"g").string|tonumber]|last|tostring;
+
 def format_chapter:
-  ([.]|flatten ) as $input
-  | [ ($input|flatten| map(select(test("([cC]|[pP]|[sS])[\\w]*?[0-9]+"))) ),$input]
-  | flatten(2)
-  | first
-  | gsub("(epub|EPUB|xhtml|html|ji[0-9]+|sup[0-9]+|nav)[ ]?"; "")
-  | gsub("[cC][hapter ]{2,}(?<n>[0-9]+)"; "Chapter " + (.n|tonumber|tostring)? // .n; "x")
-  | gsub( "[sS][ection ]{2,}(?<n>[0-9]+)"; "Section " + (.n|tonumber|tostring)? // .n; "x")
-  | gsub( "[pP][art ]{1,}(?<n>[0-9]+)"; "Part " + (.n|tonumber|tostring)? // .n; "x")
-  | gsub( "^[cC][hapter]*?(?<n>[0-9]+)"; "Chapter " + (.n|tonumber|tostring)? // .n; "x")
-  | gsub( "^[pP](?<n>[ 0-9]+)"; "Page " + (.n|tonumber|tostring)? // .n; "x")
-  | gsub("(?<w>[a-zA-Z]{3,})(?<d>[0-9])"; .w + " " + (.d|tonumber|tostring)? // .n ;"x")
-  | gsub(".*[0-9]{6,}.*(?<n>[0-9]+)"; "x" + .n; "x")
+  ([.]|flatten|unique) as $input
+  | ($input | join(" ")
+  | gsub("x?[0-9]{8,}"; .n; "x")) as $str
+  | $str |
+  if test("[cC][chapter]{1,}") then "Chapter " + getnum($str)
+  elif test("toc_marker") then getnum($str)
+  else
+    $str
+    | gsub("(epub|EPUB|[[:punct:]]xhtml|html|ji[0-9]+|sup[0-9]+|nav_|div[0-9]+)"; "")
+    | gsub("^[^a-zA-Z0-9]+";"")
+  end
   ;
 
 def format_location:
   [match("\\[([^\\]]+)\\]+";"g").captures[].string] as $raw
   | ($raw|join(" ")|identify_subsection) as $sec
   | (if $sec then $sec else ($raw|format_chapter) end)
-  | gsub("[_-]+";" ";"x")
-  | gsub("^[\\s]+|[\\s]+$";"")
   ;