From: umbraticus@prosimetrum.com
To: 9front@9front.org
Subject: Re: [9front] htmlfmt anchor corner cases
Date: Thu, 31 Dec 2020 22:42:46 +1300 [thread overview]
Message-ID: <6980D1E3953AA95A3C3A4D2DC9967FEA@prosimetrum.com> (raw)
In-Reply-To: <A7D190FBBAB6CFB698B84019E74DE58A@prosimetrum.com>
This patch makes the following changes to htmlfmt:
• Print image src like {url} instead of [image url]
• Properly combine rooted paths with base url
• Handle “protocol relative” urls
• Respect <base> tag
• Print document title at top
• Implement footnote mode -f
• Remove unused crap
umbraticus
diff -r b24b6b01d46a sys/src/cmd/htmlfmt/dat.h
--- a/sys/src/cmd/htmlfmt/dat.h Tue Dec 29 19:38:59 2020 +0000
+++ b/sys/src/cmd/htmlfmt/dat.h Thu Dec 31 16:08:26 2020 +1300
@@ -3,6 +3,7 @@
enum
{
+ NONE, INLINE, FOOTNOTES,
STACK = 8192,
EVENTSIZE = 256,
};
@@ -20,29 +21,15 @@
int outfd;
int type;
- char *url;
Item *items;
Docinfo *docinfo;
};
-extern char* url;
-extern int aflag;
+extern Rune* baseurl;
+extern int links;
extern int width;
extern char* loadhtml(int);
-
-extern char* readfile(char*, char*, int*);
-extern void* emalloc(ulong);
-extern char* estrdup(char*);
-extern char* estrstrdup(char*, char*);
-extern char* egrow(char*, char*, char*);
-extern char* eappend(char*, char*, char*);
-extern void error(char*, ...);
-
extern void growbytes(Bytes*, char*, long);
-
-extern void rendertext(URLwin*, Bytes*);
extern void rerender(URLwin*);
extern void freeurlwin(URLwin*);
-
-#pragma varargck argpos error 1
diff -r b24b6b01d46a sys/src/cmd/htmlfmt/html.c
--- a/sys/src/cmd/htmlfmt/html.c Tue Dec 29 19:38:59 2020 +0000
+++ b/sys/src/cmd/htmlfmt/html.c Thu Dec 31 16:08:26 2020 +1300
@@ -7,14 +7,49 @@
#include <ctype.h>
#include "dat.h"
-char urlexpr[] =
- "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)"
- "://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
-Reprog *urlprog;
-
int inword = 0;
int col = 0;
int wordi = 0;
+Rune* proto;
+Rune* root;
+Rune* base;
+
+/* split url into the proto ("http:"), root ("http://host") and base (directory) prefixes used by renderurl */
+void
+setbaseurls(Rune *url)
+{
+	Rune *r;
+
+	if(url == nil)
+		return;
+	free(proto);
+	free(root);
+	free(base);
+	/* just a basic check: one or more letters, then "://" and something after it */
+	for(r = url; isalpha(*r); r++)
+		;
+	if(r == url || r[0] != ':' || r[1] != '/' || r[2] != '/' || r[3] == 0){	/* r == url: no scheme letters */
+		fprint(2, "%s: ignoring invalid base url: %S\n", argv0, url);
+		proto = root = base = nil;	/* just freed; do not leave them dangling */
+		return;
+	}
+
+	r[1] = 0;	/* temporarily cut after ':' so proto is the scheme plus colon */
+	proto = runestrdup(url);
+	r[1] = '/';
+	if(r = runestrchr(r + 3, '/')){
+		*r = 0;	/* cut at the first '/' after "://": everything before the path */
+		root = runestrdup(url);
+		*r = '/';
+		r = runestrrchr(r, '/');
+		*r = 0;	/* cut at the last '/': url without its final path component */
+		base = runestrdup(url);
+		*r = '/';
+		return;
+	}
+	base = runestrdup(url);	/* no path at all: root and base are both the whole url */
+	root = runestrdup(url);
+}
char*
loadhtml(int fd)
@@ -27,7 +62,6 @@
u = emalloc(sizeof(URLwin));
u->infd = fd;
u->outfd = 1;
- u->url = estrdup(url);
u->type = TextHtml;
b = emalloc(sizeof(Bytes));
@@ -35,24 +69,13 @@
growbytes(b, buf, n);
if(b->b == nil)
return nil; /* empty file */
- rendertext(u, b);
+ u->items = parsehtml(b->b, b->n, baseurl, u->type, UTF_8, &u->docinfo);
+ setbaseurls(u->docinfo->base);
+ rerender(u);
freeurlwin(u);
return nil;
}
-char*
-runetobyte(Rune *r, int n)
-{
- char *s;
-
- if(n == 0)
- return emalloc(1);
- s = smprint("%.*S", n, r);
- if(s == nil)
- error("malloc failed");
- return s;
-}
-
int
closingpunct(char c)
{
@@ -129,58 +152,23 @@
free(r);
}
-char*
-baseurl(char *url)
+void
+renderurl(Bytes *t, Rune *path, char lc, char rc)
{
- char *base, *slash;
- Resub rs[10];
+ Rune *r;
- if(url == nil)
- return nil;
- if(urlprog == nil){
- urlprog = regcomp(urlexpr);
- if(urlprog == nil)
- error("can't compile URL regexp");
+ if(path == nil){
+ renderbytes(t, "%cnull_url%c", lc, rc);
+ return;
}
- memset(rs, 0, sizeof rs);
- if(regexec(urlprog, url, rs, nelem(rs)) == 0)
- return nil;
- base = estrdup(url);
- slash = strrchr(base, '/');
- if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp])
- *slash = '\0';
+ for(r = path; isalpha(*r); r++)
+ ;
+ if(base == nil || r[0] == '#' || r > path && r[0] == ':' && r[1] == '/' && r[2] == '/' && r[3])
+ renderbytes(t, "%c%S%c", lc, path, rc);
+ else if(path[0] == '/')
+ renderbytes(t, "%c%S%S%c", lc, path[1] == '/' ? proto : root, path, rc);
else
- base[rs[0].ep-rs[0].sp] = '\0';
- return base;
-}
-
-char*
-fullurl(URLwin *u, Rune *rhref)
-{
- char *base, *href, *hrefbase;
- char *result;
-
- if(rhref == nil)
- return estrdup("NULL URL");
- href = runetobyte(rhref, runestrlen(rhref));
- hrefbase = baseurl(href);
- result = nil;
- if(hrefbase==nil && (base = baseurl(u->url))!=nil){
- result = estrdup(base);
- if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
- result = eappend(result, "/", "");
- free(base);
- }
- if(href){
- if(result)
- result = eappend(result, "", href);
- else
- result = estrdup(href);
- }
- free(hrefbase);
- if(result == nil)
- return estrdup("***unknown***");
- return result;
+ renderbytes(t, "%c%S/%S%c", lc, base, path, rc);
}
void
@@ -195,11 +183,12 @@
Anchor *a;
Table *tab;
Tablecell *cell;
- char *href;
+ int nimg;
inword = 0;
col = 0;
wordi = 0;
+ nimg = 1;
for(il=items; il!=nil; il=il->next){
if(il->state & IFbrk)
@@ -221,17 +210,18 @@
renderbytes(t, "=======\n");
break;
case Iimagetag:
- if(!aflag)
+ if(links == NONE)
break;
im = (Iimage*)il;
if(im->imsrc){
- href = fullurl(u, im->imsrc);
- renderbytes(t, "[image %s]", href);
- free(href);
+ if(links & FOOTNOTES)
+ renderbytes(t, "{%d}", nimg++);
+ else
+ renderurl(t, im->imsrc, '{', '}');
}
break;
case Iformfieldtag:
- if(aflag)
+ if(links != NONE)
renderbytes(t, "[formfield]");
break;
case Itabletag:
@@ -253,14 +243,15 @@
renderbytes(t, " ");
break;
default:
- error("unknown item tag %d\n", il->tag);
+ sysfatal("unknown item tag %d\n", il->tag);
}
if(il->anchorid != 0 && il->anchorid!=curanchor){
for(a=u->docinfo->anchors; a!=nil; a=a->next)
- if(aflag && a->index == il->anchorid){
- href = fullurl(u, a->href);
- renderbytes(t, "[%s]", href);
- free(href);
+ if(links != NONE && a->index == il->anchorid){
+ if(links & FOOTNOTES)
+ renderbytes(t, "[%d]", a->index);
+ else
+ renderurl(t, a->href, '[', ']');
break;
}
curanchor = il->anchorid;
@@ -271,13 +262,55 @@
}
void
+afootnotes(URLwin *u, Bytes *t){
+	Anchor *x, *y, *z;
+
+	/* append a "links:" section: one "[index] url" line per anchor in u->docinfo */
+	x = u->docinfo->anchors;
+	if(x == nil)
+		return;
+	renderbytes(t, "\n\nlinks:\n");
+	/* list needs reversing; the new head is stored back so docinfo still reaches every node */
+	for(z = nil; x != nil; x = y){
+		y = x->next;
+		x->next = z;
+		z = x;
+	}
+	for(x = u->docinfo->anchors = z; x != nil; x = x->next){
+		renderbytes(t, "[%d]", x->index);
+		renderurl(t, x->href, ' ', '\n');
+	}
+}
+
+/* append an "images:" section; only images with a src are numbered, matching the {n} markers render emits */
+void
+imgfootnotes(URLwin *u, Bytes *t){
+	Iimage *i;
+	int n;
+	if((i = u->docinfo->images) == nil)
+		return;
+	renderbytes(t, "\n\nimages:\n");
+	for(n = 1; i != nil; i = i->nextimage)
+		if(i->imsrc != nil){	/* render skips src-less images without consuming a number */
+			renderbytes(t, "{%d}", n++);
+			renderurl(t, i->imsrc, ' ', '\n');
+		}
+}
+
+void
rerender(URLwin *u)
{
Bytes *t;
t = emalloc(sizeof(Bytes));
+ if(u->docinfo->doctitle!=nil)
+ renderbytes(t, "%S\n\n", u->docinfo->doctitle);
render(u, t, u->items, 0);
+ if(links & FOOTNOTES){
+ afootnotes(u, t);
+ imgfootnotes(u, t);
+ }
if(t->n)
write(u->outfd, (char*)t->b, t->n);
@@ -286,19 +319,6 @@
}
void
-rendertext(URLwin *u, Bytes *b)
-{
- Rune *rurl;
-
- rurl = toStr((uchar*)u->url, strlen(u->url), UTF_8);
- u->items = parsehtml(b->b, b->n, rurl, u->type, UTF_8, &u->docinfo);
-// free(rurl);
-
- rerender(u);
-}
-
-
-void
freeurlwin(URLwin *u)
{
freeitems(u->items);
diff -r b24b6b01d46a sys/src/cmd/htmlfmt/main.c
--- a/sys/src/cmd/htmlfmt/main.c Tue Dec 29 19:38:59 2020 +0000
+++ b/sys/src/cmd/htmlfmt/main.c Thu Dec 31 16:08:26 2020 +1300
@@ -5,8 +5,8 @@
#include <html.h>
#include "dat.h"
-char *url = "";
-int aflag;
+Rune *baseurl;
+int links;
int width = 70;
char *defcharset = "latin1";
@@ -53,11 +53,14 @@
ARGBEGIN{
case 'a':
- aflag++;
+ links |= INLINE;
break;
case 'c':
defcharset = EARGF(usage());
break;
+ case 'f':
+ links |= FOOTNOTES;
+ break;
case 'l': case 'w':
err = EARGF(usage());
width = atoi(err);
@@ -65,8 +68,12 @@
usage();
break;
case 'u':
- url = EARGF(usage());
- aflag++;
+ err = EARGF(usage());
+ free(baseurl);
+ baseurl = emalloc((utflen(err) + 1) * sizeof(Rune));
+ for(i = 0; *err != '\0'; i++)
+ err += chartorune(baseurl + i, err);
+ links |= INLINE;
break;
default:
usage();
diff -r b24b6b01d46a sys/src/cmd/htmlfmt/util.c
--- a/sys/src/cmd/htmlfmt/util.c Tue Dec 29 19:38:59 2020 +0000
+++ b/sys/src/cmd/htmlfmt/util.c Thu Dec 31 16:08:26 2020 +1300
@@ -12,7 +12,7 @@
p = malloc(n);
if(p == nil)
- error("can't malloc: %r");
+ sysfatal("malloc: %r");
memset(p, 0, n);
return p;
}
@@ -22,88 +22,10 @@
{
p = realloc(p, n);
if(p == nil)
- error("can't malloc: %r");
+ sysfatal("realloc: %r");
return p;
}
-char*
-estrdup(char *s)
-{
- char *t;
-
- t = emalloc(strlen(s)+1);
- strcpy(t, s);
- return t;
-}
-
-char*
-estrstrdup(char *s, char *t)
-{
- long ns, nt;
- char *u;
-
- ns = strlen(s);
- nt = strlen(t);
- /* use malloc to avoid memset */
- u = malloc(ns+nt+1);
- if(u == nil)
- error("can't malloc: %r");
- memmove(u, s, ns);
- memmove(u+ns, t, nt);
- u[ns+nt] = '\0';
- return u;
-}
-
-char*
-eappend(char *s, char *sep, char *t)
-{
- long ns, nsep, nt;
- char *u;
-
- if(t == nil)
- u = estrstrdup(s, sep);
- else{
- ns = strlen(s);
- nsep = strlen(sep);
- nt = strlen(t);
- /* use malloc to avoid memset */
- u = malloc(ns+nsep+nt+1);
- if(u == nil)
- error("can't malloc: %r");
- memmove(u, s, ns);
- memmove(u+ns, sep, nsep);
- memmove(u+ns+nsep, t, nt);
- u[ns+nsep+nt] = '\0';
- }
- free(s);
- return u;
-}
-
-char*
-egrow(char *s, char *sep, char *t)
-{
- s = eappend(s, sep, t);
- free(t);
- return s;
-}
-
-void
-error(char *fmt, ...)
-{
- va_list arg;
- char buf[256];
- Fmt f;
-
- fmtfdinit(&f, 2, buf, sizeof buf);
- fmtprint(&f, "Mail: ");
- va_start(arg, fmt);
- fmtvprint(&f, fmt, arg);
- va_end(arg);
- fmtprint(&f, "\n");
- fmtfdflush(&f);
- exits(fmt);
-}
-
void
growbytes(Bytes *b, char *s, long ns)
{
@@ -112,7 +34,7 @@
/* use realloc to avoid memset */
b->b = realloc(b->b, b->nalloc);
if(b->b == nil)
- error("growbytes: can't realloc: %r");
+ sysfatal("growbytes: can't realloc: %r");
}
memmove(b->b+b->n, s, ns);
b->n += ns;
next prev parent reply other threads:[~2020-12-31 9:43 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-12-20 9:29 umbraticus
2020-12-20 22:03 ` cinap_lenrek
2020-12-30 3:47 ` umbraticus
2020-12-31 9:42 ` umbraticus [this message]
2021-01-01 4:42 ` umbraticus
2021-01-01 10:05 ` Steve Simon
2021-01-01 19:26 ` ori
2021-01-20 2:20 ` ori
2021-01-20 2:49 ` Alex Musolino
2021-01-20 3:17 ` umbraticus
2021-01-24 5:46 ` umbraticus
2021-01-24 23:51 ` ori
2021-01-25 18:42 ` umbraticus
2021-01-27 2:42 ` ori
2021-08-28 20:22 ` Stuart Morrow
2021-08-29 1:52 ` umbraticus
2021-08-29 2:08 ` ori
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=6980D1E3953AA95A3C3A4D2DC9967FEA@prosimetrum.com \
--to=umbraticus@prosimetrum.com \
--cc=9front@9front.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).