From mboxrd@z Thu Jan 1 00:00:00 1970 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on inbox.vuxu.org X-Spam-Level: X-Spam-Status: No, score=0.0 required=5.0 tests=RCVD_IN_DNSWL_NONE autolearn=ham autolearn_force=no version=3.4.4 Received: (qmail 32125 invoked from network); 31 Dec 2020 09:43:12 -0000 Received: from ewsd.inri.net (107.191.116.128) by inbox.vuxu.org with ESMTPUTF8; 31 Dec 2020 09:43:12 -0000 Received: from asquith.prosimetrum.com ([125.236.209.157]) by ewsd; Thu Dec 31 04:41:48 -0500 2020 Message-ID: <6980D1E3953AA95A3C3A4D2DC9967FEA@prosimetrum.com> Date: Thu, 31 Dec 2020 22:42:46 +1300 From: umbraticus@prosimetrum.com To: 9front@9front.org In-Reply-To: MIME-Version: 1.0 Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: 8bit List-ID: <9front.9front.org> List-Help: X-Glyph: ➈ X-Bullshit: mobile stable package component Subject: Re: [9front] htmlfmt anchor corner cases Reply-To: 9front@9front.org Precedence: bulk This patch makes the following changes to htmlfmt: • Print image src like {url} instead of [image url] • Properly combine rooted paths with base url • Handle “protocol relative” urls • Respect <base> tag • Print document title at top • Implement footnote mode -f • Remove unused crap umbraticus diff -r b24b6b01d46a sys/src/cmd/htmlfmt/dat.h --- a/sys/src/cmd/htmlfmt/dat.h Tue Dec 29 19:38:59 2020 +0000 +++ b/sys/src/cmd/htmlfmt/dat.h Thu Dec 31 16:08:26 2020 +1300 @@ -3,6 +3,7 @@ enum { + NONE, INLINE, FOOTNOTES, STACK = 8192, EVENTSIZE = 256, }; @@ -20,29 +21,15 @@ int outfd; int type; - char *url; Item *items; Docinfo *docinfo; }; -extern char* url; -extern int aflag; +extern Rune* baseurl; +extern int links; extern int width; extern char* loadhtml(int); - -extern char* readfile(char*, char*, int*); -extern void* emalloc(ulong); -extern char* estrdup(char*); -extern char* estrstrdup(char*, char*); -extern char* egrow(char*, char*, char*); -extern char* eappend(char*, char*, char*); -extern void error(char*, ...); - 
extern void growbytes(Bytes*, char*, long); - -extern void rendertext(URLwin*, Bytes*); extern void rerender(URLwin*); extern void freeurlwin(URLwin*); - -#pragma varargck argpos error 1 diff -r b24b6b01d46a sys/src/cmd/htmlfmt/html.c --- a/sys/src/cmd/htmlfmt/html.c Tue Dec 29 19:38:59 2020 +0000 +++ b/sys/src/cmd/htmlfmt/html.c Thu Dec 31 16:08:26 2020 +1300 @@ -7,14 +7,49 @@ #include #include "dat.h" -char urlexpr[] = - "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)" - "://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)"; -Reprog *urlprog; - int inword = 0; int col = 0; int wordi = 0; +Rune* proto; +Rune* root; +Rune* base; + +void +setbaseurls(Rune *url) +{ + Rune *r; + + if(url == nil) + return; + free(proto); + free(root); + free(base); + + /* just a basic check... */ + for(r = url; isalpha(*r); r++) + ; + if(r == baseurl || r[0] != ':' || r[1] != '/' || r[2] != '/' || r[3] == 0){ + fprint(2, "%s: ignoring invalid base url: %S\n", argv0, url); + proto = root = base = nil; + return; + } + + r[1] = 0; + proto = runestrdup(url); + r[1] = '/'; + if(r = runestrchr(r + 3, '/')){ + *r = 0; + root = runestrdup(url); + *r = '/'; + r = runestrrchr(r, '/'); + *r = 0; + base = runestrdup(url); + *r = '/'; + return; + } + base = runestrdup(url); + root = runestrdup(url); +} char* loadhtml(int fd) @@ -27,7 +62,6 @@ u = emalloc(sizeof(URLwin)); u->infd = fd; u->outfd = 1; - u->url = estrdup(url); u->type = TextHtml; b = emalloc(sizeof(Bytes)); @@ -35,24 +69,13 @@ growbytes(b, buf, n); if(b->b == nil) return nil; /* empty file */ - rendertext(u, b); + u->items = parsehtml(b->b, b->n, baseurl, u->type, UTF_8, &u->docinfo); + setbaseurls(u->docinfo->base); + rerender(u); freeurlwin(u); return nil; } -char* -runetobyte(Rune *r, int n) -{ - char *s; - - if(n == 0) - return emalloc(1); - s = smprint("%.*S", n, r); - if(s == nil) - error("malloc failed"); - return s; -} - int closingpunct(char c) { @@ -129,58 +152,23 @@ free(r); } -char* -baseurl(char *url) +void 
+renderurl(Bytes *t, Rune *path, char lc, char rc) { - char *base, *slash; - Resub rs[10]; + Rune *r; - if(url == nil) - return nil; - if(urlprog == nil){ - urlprog = regcomp(urlexpr); - if(urlprog == nil) - error("can't compile URL regexp"); + if(path == nil){ + renderbytes(t, "%cnull_url%c", lc, rc); + return; } - memset(rs, 0, sizeof rs); - if(regexec(urlprog, url, rs, nelem(rs)) == 0) - return nil; - base = estrdup(url); - slash = strrchr(base, '/'); - if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp]) - *slash = '\0'; + for(r = path; isalpha(*r); r++) + ; + if(base == nil || r[0] == '#' || r > path && r[0] == ':' && r[1] == '/' && r[2] == '/' && r[3]) + renderbytes(t, "%c%S%c", lc, path, rc); + else if(path[0] == '/') + renderbytes(t, "%c%S%S%c", lc, path[1] == '/' ? proto : root, path, rc); else - base[rs[0].ep-rs[0].sp] = '\0'; - return base; -} - -char* -fullurl(URLwin *u, Rune *rhref) -{ - char *base, *href, *hrefbase; - char *result; - - if(rhref == nil) - return estrdup("NULL URL"); - href = runetobyte(rhref, runestrlen(rhref)); - hrefbase = baseurl(href); - result = nil; - if(hrefbase==nil && (base = baseurl(u->url))!=nil){ - result = estrdup(base); - if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/')) - result = eappend(result, "/", ""); - free(base); - } - if(href){ - if(result) - result = eappend(result, "", href); - else - result = estrdup(href); - } - free(hrefbase); - if(result == nil) - return estrdup("***unknown***"); - return result; + renderbytes(t, "%c%S/%S%c", lc, base, path, rc); } void @@ -195,11 +183,12 @@ Anchor *a; Table *tab; Tablecell *cell; - char *href; + int nimg; inword = 0; col = 0; wordi = 0; + nimg = 1; for(il=items; il!=nil; il=il->next){ if(il->state & IFbrk) @@ -221,17 +210,18 @@ renderbytes(t, "=======\n"); break; case Iimagetag: - if(!aflag) + if(links == NONE) break; im = (Iimage*)il; if(im->imsrc){ - href = fullurl(u, im->imsrc); - renderbytes(t, "[image %s]", href); - free(href); + if(links & FOOTNOTES) + 
renderbytes(t, "{%d}", nimg++); + else + renderurl(t, im->imsrc, '{', '}'); } break; case Iformfieldtag: - if(aflag) + if(links != NONE) renderbytes(t, "[formfield]"); break; case Itabletag: @@ -253,14 +243,15 @@ renderbytes(t, " "); break; default: - error("unknown item tag %d\n", il->tag); + sysfatal("unknown item tag %d\n", il->tag); } if(il->anchorid != 0 && il->anchorid!=curanchor){ for(a=u->docinfo->anchors; a!=nil; a=a->next) - if(aflag && a->index == il->anchorid){ - href = fullurl(u, a->href); - renderbytes(t, "[%s]", href); - free(href); + if(links != NONE && a->index == il->anchorid){ + if(links & FOOTNOTES) + renderbytes(t, "[%d]", a->index); + else + renderurl(t, a->href, '[', ']'); break; } curanchor = il->anchorid; @@ -271,13 +262,55 @@ } void +afootnotes(URLwin *u, Bytes *t){ + Anchor *x, *y, *z; + + x = u->docinfo->anchors; + if(x == nil) + return; + renderbytes(t, "\n\nlinks:\n"); + + /* list needs reversing */ + for(z = nil; x->next != nil; x = y){ + y = x->next; + x->next = z; + z = x; + } + for(x->next = z; x != nil; x = x->next){ + renderbytes(t, "[%d]", x->index); + renderurl(t, x->href, ' ', '\n'); + }; +} + +void +imgfootnotes(URLwin *u, Bytes *t){ + Iimage *i; + int n; + + i = u->docinfo->images; + if(i == nil) + return; + renderbytes(t, "\n\nimages:\n"); + for(n=1; i!=nil; i=i->nextimage){ + renderbytes(t, "{%d}", n++); + renderurl(t, i->imsrc, ' ', '\n'); + } +} + +void rerender(URLwin *u) { Bytes *t; t = emalloc(sizeof(Bytes)); + if(u->docinfo->doctitle!=nil) + renderbytes(t, "%S\n\n", u->docinfo->doctitle); render(u, t, u->items, 0); + if(links & FOOTNOTES){ + afootnotes(u, t); + imgfootnotes(u, t); + } if(t->n) write(u->outfd, (char*)t->b, t->n); @@ -286,19 +319,6 @@ } void -rendertext(URLwin *u, Bytes *b) -{ - Rune *rurl; - - rurl = toStr((uchar*)u->url, strlen(u->url), UTF_8); - u->items = parsehtml(b->b, b->n, rurl, u->type, UTF_8, &u->docinfo); -// free(rurl); - - rerender(u); -} - - -void freeurlwin(URLwin *u) { 
freeitems(u->items); diff -r b24b6b01d46a sys/src/cmd/htmlfmt/main.c --- a/sys/src/cmd/htmlfmt/main.c Tue Dec 29 19:38:59 2020 +0000 +++ b/sys/src/cmd/htmlfmt/main.c Thu Dec 31 16:08:26 2020 +1300 @@ -5,8 +5,8 @@ #include #include "dat.h" -char *url = ""; -int aflag; +Rune *baseurl; +int links; int width = 70; char *defcharset = "latin1"; @@ -53,11 +53,14 @@ ARGBEGIN{ case 'a': - aflag++; + links |= INLINE; break; case 'c': defcharset = EARGF(usage()); break; + case 'f': + links |= FOOTNOTES; + break; case 'l': case 'w': err = EARGF(usage()); width = atoi(err); @@ -65,8 +68,12 @@ usage(); break; case 'u': - url = EARGF(usage()); - aflag++; + err = EARGF(usage()); + free(baseurl); + baseurl = emalloc((utflen(err) + 1) * sizeof(Rune)); + for(i = 0; *err != '\0'; i++) + err += chartorune(baseurl + i, err); + links |= INLINE; break; default: usage(); diff -r b24b6b01d46a sys/src/cmd/htmlfmt/util.c --- a/sys/src/cmd/htmlfmt/util.c Tue Dec 29 19:38:59 2020 +0000 +++ b/sys/src/cmd/htmlfmt/util.c Thu Dec 31 16:08:26 2020 +1300 @@ -12,7 +12,7 @@ p = malloc(n); if(p == nil) - error("can't malloc: %r"); + sysfatal("malloc: %r"); memset(p, 0, n); return p; } @@ -22,88 +22,10 @@ { p = realloc(p, n); if(p == nil) - error("can't malloc: %r"); + sysfatal("realloc: %r"); return p; } -char* -estrdup(char *s) -{ - char *t; - - t = emalloc(strlen(s)+1); - strcpy(t, s); - return t; -} - -char* -estrstrdup(char *s, char *t) -{ - long ns, nt; - char *u; - - ns = strlen(s); - nt = strlen(t); - /* use malloc to avoid memset */ - u = malloc(ns+nt+1); - if(u == nil) - error("can't malloc: %r"); - memmove(u, s, ns); - memmove(u+ns, t, nt); - u[ns+nt] = '\0'; - return u; -} - -char* -eappend(char *s, char *sep, char *t) -{ - long ns, nsep, nt; - char *u; - - if(t == nil) - u = estrstrdup(s, sep); - else{ - ns = strlen(s); - nsep = strlen(sep); - nt = strlen(t); - /* use malloc to avoid memset */ - u = malloc(ns+nsep+nt+1); - if(u == nil) - error("can't malloc: %r"); - memmove(u, s, ns); - 
memmove(u+ns, sep, nsep); - memmove(u+ns+nsep, t, nt); - u[ns+nsep+nt] = '\0'; - } - free(s); - return u; -} - -char* -egrow(char *s, char *sep, char *t) -{ - s = eappend(s, sep, t); - free(t); - return s; -} - -void -error(char *fmt, ...) -{ - va_list arg; - char buf[256]; - Fmt f; - - fmtfdinit(&f, 2, buf, sizeof buf); - fmtprint(&f, "Mail: "); - va_start(arg, fmt); - fmtvprint(&f, fmt, arg); - va_end(arg); - fmtprint(&f, "\n"); - fmtfdflush(&f); - exits(fmt); -} - void growbytes(Bytes *b, char *s, long ns) { @@ -112,7 +34,7 @@ /* use realloc to avoid memset */ b->b = realloc(b->b, b->nalloc); if(b->b == nil) - error("growbytes: can't realloc: %r"); + sysfatal("growbytes: can't realloc: %r"); } memmove(b->b+b->n, s, ns); b->n += ns;