roff-like markup to HTML with additional niceties.
git clone git://git.skec.site/pub/broff.git
log | files | refs | readme | license

commit 1b80b30b4dabc105183bf1e4e024aac8379f3e92
parent 39b651135bb7843c551f84bde3c9a3c0764b152f
Author: Michael Skec
Date:   Wed, 22 Nov 2023 17:44:41 +1100

unicode escape parsing

Parse roff \[uXXXX] Unicode escape codes and emit them as HTML numeric
character references (&#xXXXX;). Also revert the previous commit, which
added a non-standard superscript escape (\(Sc).

Diffstat:
M broff.c | 21 ++++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/broff.c b/broff.c @@ -144,9 +144,6 @@ print_escaped(const char *line, int l) HANDLE_ESC("\\(aq", "'"); HANDLE_ESC("\\(dq", "\""); - // Superscripts (not actual Roff escapes) - HANDLE_ESC("\\(Sc", "&#x1D9C;"); - // TeX style typographer quotes; must be in this order HANDLE_ESC("``", "&ldquo;"); HANDLE_ESC("''", "&rdquo;"); @@ -159,6 +156,24 @@ print_escaped(const char *line, int l) HANDLE_ESC("&", "&amp;"); HANDLE_ESC("...", "&hellip;"); + // Check for unicode escapes, which can appear like \[uXXXX] + static const char *const UNICODE_ESC_PREFIX = "\\[u"; + if (len_remain > strlen(UNICODE_ESC_PREFIX) && + strncmp(c, UNICODE_ESC_PREFIX, strlen(UNICODE_ESC_PREFIX)) == 0) + { + // Find end of escape (denoted by right-square bracket) + c += strlen(UNICODE_ESC_PREFIX); + const char *c_prev = c; + for (; c < line + l && *c != ']'; ++c); + + // Print the character as HTML escape. + printf("&#x%.*s;", + (int)(c - c_prev), + c_prev); + + continue; + } + if (ispunct(*c) && strchr(",.!?;:'\"", *c) == NULL) { // Escape characters that aren't alphanumeric and are not a