From 519994c3077f9e0e0d1f2f9a9723dca470346ab4 Mon Sep 17 00:00:00 2001
From: nntrn <17685332+nntrn@users.noreply.github.com>
Date: Mon, 30 Sep 2024 04:53:20 -0500
Subject: [PATCH] Condense regex

---
 scripts/annotations.jq |  2 +-
 scripts/books.jq       | 31 ++++++++++++++++---------------
 2 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/scripts/annotations.jq b/scripts/annotations.jq
index 0ae2aa2..49d65e5 100644
--- a/scripts/annotations.jq
+++ b/scripts/annotations.jq
@@ -34,7 +34,7 @@ def create_activity_data:
       created: .ZANNOTATIONCREATIONDATE,
       location: .ZANNOTATIONLOCATION,
       cfi: (.ZANNOTATIONLOCATION|[match("\\b[0-9]{1,4}\\b";"g").string|tonumber]|join(".")),
-      chapter: (if ((.ZFUTUREPROOFING5|length)>0) then .ZFUTUREPROOFING5 else (.ZANNOTATIONLOCATION|format_location) end),
+      chapter: (if ((.ZFUTUREPROOFING5|length)>0) then .ZFUTUREPROOFING5 else (.ZANNOTATIONLOCATION|format_location)? // .rangestart end),
       rangestart: .ZPLLOCATIONRANGESTART
   }) | sort_by(.id);
 
diff --git a/scripts/books.jq b/scripts/books.jq
index 5a304fa..1b71a2f 100644
--- a/scripts/books.jq
+++ b/scripts/books.jq
@@ -74,32 +74,33 @@ def identify_subsection:
   elif test("[Pp][rologue]{2,}") then "Prologue"
   elif test("[gG][losary]{2,}") then "Glossary"
   elif test("[aA][fterword]{3,}") then "Afterword"
-  elif test("[aA][pendix]{3,}";"i") then "APPENDIX \($string|gsub("^[0-9]";""))"
+  elif test("[aA][pendix]{3,}";"i") then "Appendix"
   elif test("[fF][ront]{2,}") then "Front"
   elif test("^[A-Z0-9 ]+$") then ($string |gsub("[^0-9]";"")|tonumber|tostring)
   else  null
   end
   ;
 
+# getnum: last run of digits in $str as a normalized number string ("007" -> "7"); "null" if none.
+def getnum($str):
+  [$str | scan("[0-9]+") | tonumber] | last | tostring;
+
 def format_chapter:
-  ([.]|flatten ) as $input
-  | [ ($input|flatten| map(select(test("([cC]|[pP]|[sS])[\\w]*?[0-9]+"))) ),$input]
-  | flatten(2)
-  | first
-  | gsub("(epub|EPUB|xhtml|html|ji[0-9]+|sup[0-9]+|nav)[ ]?"; "")
-  | gsub("[cC][hapter ]{2,}(?<n>[0-9]+)";  "Chapter " + (.n|tonumber|tostring)? // .n; "x")
-  | gsub( "[sS][ection ]{2,}(?<n>[0-9]+)"; "Section " + (.n|tonumber|tostring)? // .n;  "x")
-  | gsub( "[pP][art ]{1,}(?<n>[0-9]+)";  "Part " + (.n|tonumber|tostring)? // .n;   "x")
-  | gsub( "^[cC][hapter]*?(?<n>[0-9]+)";  "Chapter " + (.n|tonumber|tostring)? // .n;   "x")
-  | gsub( "^[pP](?<n>[ 0-9]+)";  "Page " + (.n|tonumber|tostring)? // .n;   "x")
-  | gsub("(?<w>[a-zA-Z]{3,})(?<d>[0-9])"; .w + " " + (.d|tonumber|tostring)? // .n  ;"x")
-  | gsub(".*[0-9]{6,}.*(?<n>[0-9]+)"; "x" + .n; "x")
+  ([.]|flatten|unique) as $input
+  # Join the unique ids and strip runs of 8+ digits (optionally prefixed by "x").
+  | ($input | join(" ") | gsub("x?[0-9]{8,}"; "")) as $str
+  # Require a digit before calling getnum, which returns the string "null" when it finds none.
+  | $str | if test("[0-9]") and test("[cC][chapter]{1,}") then "Chapter " + getnum($str)
+  elif test("[0-9]") and test("toc_marker") then getnum($str)
+  else
+    $str
+    | gsub("(epub|EPUB|[[:punct:]]xhtml|html|ji[0-9]+|sup[0-9]+|nav_|div[0-9]+)"; "")
+    | gsub("^[^a-zA-Z0-9]+";"")
+  end
   ;
 
 def format_location:
   [match("\\[([^\\]]+)\\]+";"g").captures[].string] as $raw 
   | ($raw|join(" ")|identify_subsection) as $sec
   | (if $sec then $sec  else ($raw|format_chapter) end)
-  | gsub("[_-]+";" ";"x")
-  | gsub("^[\\s]+|[\\s]+$";"")
   ;