9front - general discussion about 9front
 help / color / mirror / Atom feed
From: umbraticus@prosimetrum.com
To: 9front@9front.org
Subject: Re: [9front] htmlfmt anchor corner cases
Date: Thu, 31 Dec 2020 22:42:46 +1300	[thread overview]
Message-ID: <6980D1E3953AA95A3C3A4D2DC9967FEA@prosimetrum.com> (raw)
In-Reply-To: <A7D190FBBAB6CFB698B84019E74DE58A@prosimetrum.com>

This patch makes the following changes to htmlfmt:

• Print image src like {url} instead of [image url]
• Properly combine rooted paths with base url
• Handle “protocol relative” urls
• Respect <base> tag
• Print document title at top
• Implement footnote mode -f
• Remove unused code

umbraticus

diff -r b24b6b01d46a sys/src/cmd/htmlfmt/dat.h
--- a/sys/src/cmd/htmlfmt/dat.h	Tue Dec 29 19:38:59 2020 +0000
+++ b/sys/src/cmd/htmlfmt/dat.h	Thu Dec 31 16:08:26 2020 +1300
@@ -3,6 +3,7 @@
 
 enum
 {
+	NONE, INLINE, FOOTNOTES,
 	STACK		= 8192,
 	EVENTSIZE	= 256,
 };
@@ -20,29 +21,15 @@
 	int		outfd;
 	int		type;
 
-	char		*url;
 	Item		*items;
 	Docinfo	*docinfo;
 };
 
-extern	char*	url;
-extern	int		aflag;
+extern	Rune*	baseurl;
+extern	int		links;
 extern	int		width;
 
 extern	char*	loadhtml(int);
-
-extern	char*	readfile(char*, char*, int*);
-extern	void*	emalloc(ulong);
-extern	char*	estrdup(char*);
-extern	char*	estrstrdup(char*, char*);
-extern	char*	egrow(char*, char*, char*);
-extern	char*	eappend(char*, char*, char*);
-extern	void		error(char*, ...);
-
 extern	void		growbytes(Bytes*, char*, long);
-
-extern	void		rendertext(URLwin*, Bytes*);
 extern	void		rerender(URLwin*);
 extern	void		freeurlwin(URLwin*);
-
-#pragma	varargck	argpos	error	1
diff -r b24b6b01d46a sys/src/cmd/htmlfmt/html.c
--- a/sys/src/cmd/htmlfmt/html.c	Tue Dec 29 19:38:59 2020 +0000
+++ b/sys/src/cmd/htmlfmt/html.c	Thu Dec 31 16:08:26 2020 +1300
@@ -7,14 +7,49 @@
 #include <ctype.h>
 #include "dat.h"
 
-char urlexpr[] =
-	"^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)"
-	"://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
-Reprog	*urlprog;
-
 int inword = 0;
 int col = 0;
 int wordi = 0;
+Rune* proto;
+Rune* root;
+Rune* base;
+/* setbaseurls: split url into proto ("scheme:"), root ("scheme://host") and base (url up to its last '/') */
+void
+setbaseurls(Rune *url)
+{
+	Rune *r;
+
+	if(url == nil)
+		return;	/* nothing to parse: proto, root and base keep their previous values */
+	free(proto);
+	free(root);
+	free(base);
+
+	/* just a basic check: alphabetic scheme, then "://", then at least one more rune */
+	for(r = url; isalpha(*r); r++)
+		;
+	if(r == url || r[0] != ':' || r[1] != '/' || r[2] != '/' || r[3] == 0){	/* r == url means the scheme is empty */
+		fprint(2, "%s: ignoring invalid base url: %S\n", argv0, url);
+		proto = root = base = nil;
+		return;
+	}
+
+	r[1] = 0;	/* cut right after ':'; url is edited in place and restored after each copy */
+	proto = runestrdup(url);
+	r[1] = '/';
+	if(r = runestrchr(r + 3, '/')){	/* first '/' after "://" marks the end of root */
+		*r = 0;
+		root = runestrdup(url);
+		*r = '/';
+		r = runestrrchr(r, '/');	/* last '/' in the url; at worst the one just found */
+		*r = 0;
+		base = runestrdup(url);
+		*r = '/';
+		return;
+	}
+	base = runestrdup(url);	/* no '/' after "://": the whole url is both base and root */
+	root = runestrdup(url);
+}
 
 char*
 loadhtml(int fd)
@@ -27,7 +62,6 @@
 	u = emalloc(sizeof(URLwin));
 	u->infd = fd;
 	u->outfd = 1;
-	u->url = estrdup(url);
 	u->type = TextHtml;
 
 	b = emalloc(sizeof(Bytes));
@@ -35,24 +69,13 @@
 		growbytes(b, buf, n);
 	if(b->b == nil)
 		return nil;	/* empty file */
-	rendertext(u, b);
+	u->items = parsehtml(b->b, b->n, baseurl, u->type, UTF_8, &u->docinfo);
+	setbaseurls(u->docinfo->base);
+	rerender(u);
 	freeurlwin(u);
 	return nil;
 }
 
-char*
-runetobyte(Rune *r, int n)
-{
-	char *s;
-
-	if(n == 0)
-		return emalloc(1);
-	s = smprint("%.*S", n, r);
-	if(s == nil)
-		error("malloc failed");
-	return s;
-}
-
 int
 closingpunct(char c)
 {
@@ -129,58 +152,23 @@
 	free(r);
 }
 
-char*
-baseurl(char *url)
+void
+renderurl(Bytes *t, Rune *path, char lc, char rc)
 {
-	char *base, *slash;
-	Resub rs[10];
+	Rune *r;
 
-	if(url == nil)
-		return nil;
-	if(urlprog == nil){
-		urlprog = regcomp(urlexpr);
-		if(urlprog == nil)
-			error("can't compile URL regexp");
+	if(path == nil){
+		renderbytes(t, "%cnull_url%c", lc, rc);
+		return;
 	}
-	memset(rs, 0, sizeof rs);
-	if(regexec(urlprog, url, rs, nelem(rs)) == 0)
-		return nil;
-	base = estrdup(url);
-	slash = strrchr(base, '/');
-	if(slash!=nil && slash>=&base[rs[0].ep-rs[0].sp])
-		*slash = '\0';
+	for(r = path; isalpha(*r); r++)
+		;
+	if(base == nil || r[0] == '#' || r > path && r[0] == ':' && r[1] == '/' && r[2] == '/' && r[3])
+		renderbytes(t, "%c%S%c", lc, path, rc);
+	else if(path[0] == '/')
+		renderbytes(t, "%c%S%S%c", lc, path[1] == '/' ? proto : root, path, rc);
 	else
-		base[rs[0].ep-rs[0].sp] = '\0';
-	return base;
-}
-
-char*
-fullurl(URLwin *u, Rune *rhref)
-{
-	char *base, *href, *hrefbase;
-	char *result;
-
-	if(rhref == nil)
-		return estrdup("NULL URL");
-	href = runetobyte(rhref, runestrlen(rhref));
-	hrefbase = baseurl(href);
-	result = nil;
-	if(hrefbase==nil && (base = baseurl(u->url))!=nil){
-		result = estrdup(base);
-		if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
-			result = eappend(result, "/", "");
-		free(base);
-	}
-	if(href){
-		if(result)
-			result = eappend(result, "", href);
-		else
-			result = estrdup(href);
-	}
-	free(hrefbase);
-	if(result == nil)
-		return estrdup("***unknown***");
-	return result;
+		renderbytes(t, "%c%S/%S%c", lc, base, path, rc);
 }
 
 void
@@ -195,11 +183,12 @@
 	Anchor *a;
 	Table *tab;
 	Tablecell *cell;
-	char *href;
+	int nimg;
 
 	inword = 0;
 	col = 0;
 	wordi = 0;
+	nimg = 1;
 
 	for(il=items; il!=nil; il=il->next){
 		if(il->state & IFbrk)
@@ -221,17 +210,18 @@
 			renderbytes(t, "=======\n");
 			break;
 		case Iimagetag:
-			if(!aflag)
+			if(links == NONE)
 				break;
 			im = (Iimage*)il;
 			if(im->imsrc){
-				href = fullurl(u, im->imsrc);
-				renderbytes(t, "[image %s]", href);
-				free(href);
+				if(links & FOOTNOTES)
+					renderbytes(t, "{%d}", nimg++);
+				else
+					renderurl(t, im->imsrc, '{', '}');
 			}
 			break;
 		case Iformfieldtag:
-			if(aflag)
+			if(links != NONE)
 				renderbytes(t, "[formfield]");
 			break;
 		case Itabletag:
@@ -253,14 +243,15 @@
 				renderbytes(t, " ");
 			break;
 		default:
-			error("unknown item tag %d\n", il->tag);
+			sysfatal("unknown item tag %d\n", il->tag);
 		}
 		if(il->anchorid != 0 && il->anchorid!=curanchor){
 			for(a=u->docinfo->anchors; a!=nil; a=a->next)
-				if(aflag && a->index == il->anchorid){
-					href = fullurl(u, a->href);
-					renderbytes(t, "[%s]", href);
-					free(href);
+				if(links != NONE && a->index == il->anchorid){
+					if(links & FOOTNOTES)
+						renderbytes(t, "[%d]", a->index);
+					else
+						renderurl(t, a->href, '[', ']');
 					break;
 				}
 			curanchor = il->anchorid;
@@ -271,13 +262,55 @@
 }
 
 void
+afootnotes(URLwin *u, Bytes *t){	/* emit a "links:" section into t: one "[index] url" line per anchor */
+	Anchor *x, *y, *z;
+
+	x = u->docinfo->anchors;
+	if(x == nil)
+		return;	/* no anchors: emit no section at all */
+	renderbytes(t, "\n\nlinks:\n");
+
+	/* list needs reversing; NOTE(review): done in place, and u->docinfo->anchors is not updated afterwards */
+	for(z = nil; x->next != nil; x = y){	/* flip the next pointer of every node except the last */
+		y = x->next;
+		x->next = z;
+		z = x;
+	}
+	for(x->next = z; x != nil; x = x->next){	/* hook the last node onto the reversed chain, then walk from it */
+		renderbytes(t, "[%d]", x->index);
+		renderurl(t, x->href, ' ', '\n');	/* url is printed between a leading space and a newline */
+	};
+}
+
+void
+imgfootnotes(URLwin *u, Bytes *t){	/* emit an "images:" section into t: one "{n} url" line per image */
+	Iimage *i;
+	int n;
+
+	i = u->docinfo->images;
+	if(i == nil)
+		return;	/* no images: emit no section at all */
+	renderbytes(t, "\n\nimages:\n");
+	for(n=1; i!=nil; i=i->nextimage){	/* n counts from 1 in list order; NOTE(review): presumably meant to match render's {n} markers, confirm */
+		renderbytes(t, "{%d}", n++);
+		renderurl(t, i->imsrc, ' ', '\n');	/* url is printed between a leading space and a newline */
+	}
+}
+
+void
 rerender(URLwin *u)
 {
 	Bytes *t;
 
 	t = emalloc(sizeof(Bytes));
 
+	if(u->docinfo->doctitle!=nil)
+		renderbytes(t, "%S\n\n", u->docinfo->doctitle);
 	render(u, t, u->items, 0);
+	if(links & FOOTNOTES){
+		afootnotes(u, t);
+		imgfootnotes(u, t);
+	}
 
 	if(t->n)
 		write(u->outfd, (char*)t->b, t->n);
@@ -286,19 +319,6 @@
 }
 
 void
-rendertext(URLwin *u, Bytes *b)
-{
-	Rune *rurl;
-
-	rurl = toStr((uchar*)u->url, strlen(u->url), UTF_8);
-	u->items = parsehtml(b->b, b->n, rurl, u->type, UTF_8, &u->docinfo);
-//	free(rurl);
-
-	rerender(u);
-}
-
-
-void
 freeurlwin(URLwin *u)
 {
 	freeitems(u->items);
diff -r b24b6b01d46a sys/src/cmd/htmlfmt/main.c
--- a/sys/src/cmd/htmlfmt/main.c	Tue Dec 29 19:38:59 2020 +0000
+++ b/sys/src/cmd/htmlfmt/main.c	Thu Dec 31 16:08:26 2020 +1300
@@ -5,8 +5,8 @@
 #include <html.h>
 #include "dat.h"
 
-char *url = "";
-int aflag;
+Rune *baseurl;
+int links;
 int width = 70;
 char *defcharset = "latin1";
 
@@ -53,11 +53,14 @@
 
 	ARGBEGIN{
 	case 'a':
-		aflag++;
+		links |= INLINE;
 		break;
 	case 'c':
 		defcharset = EARGF(usage());
 		break;
+	case 'f':
+		links |= FOOTNOTES;
+		break;
 	case 'l': case 'w':
 		err = EARGF(usage());
 		width = atoi(err);
@@ -65,8 +68,12 @@
 			usage();
 		break;
 	case 'u':
-		url = EARGF(usage());
-		aflag++;
+		err = EARGF(usage());
+		free(baseurl);
+		baseurl = emalloc((utflen(err) + 1) * sizeof(Rune));
+		for(i = 0; *err != '\0'; i++)
+			err += chartorune(baseurl + i, err);
+		links |= INLINE;
 		break;
 	default:
 		usage();
diff -r b24b6b01d46a sys/src/cmd/htmlfmt/util.c
--- a/sys/src/cmd/htmlfmt/util.c	Tue Dec 29 19:38:59 2020 +0000
+++ b/sys/src/cmd/htmlfmt/util.c	Thu Dec 31 16:08:26 2020 +1300
@@ -12,7 +12,7 @@
 
 	p = malloc(n);
 	if(p == nil)
-		error("can't malloc: %r");
+		sysfatal("malloc: %r");
 	memset(p, 0, n);
 	return p;
 }
@@ -22,88 +22,10 @@
 {
 	p = realloc(p, n);
 	if(p == nil)
-		error("can't malloc: %r");
+		sysfatal("realloc: %r");
 	return p;
 }
 
-char*
-estrdup(char *s)
-{
-	char *t;
-
-	t = emalloc(strlen(s)+1);
-	strcpy(t, s);
-	return t;
-}
-
-char*
-estrstrdup(char *s, char *t)
-{
-	long ns, nt;
-	char *u;
-
-	ns = strlen(s);
-	nt = strlen(t);
-	/* use malloc to avoid memset */
-	u = malloc(ns+nt+1);
-	if(u == nil)
-		error("can't malloc: %r");
-	memmove(u, s, ns);
-	memmove(u+ns, t, nt);
-	u[ns+nt] = '\0';
-	return u;
-}
-
-char*
-eappend(char *s, char *sep, char *t)
-{
-	long ns, nsep, nt;
-	char *u;
-
-	if(t == nil)
-		u = estrstrdup(s, sep);
-	else{
-		ns = strlen(s);
-		nsep = strlen(sep);
-		nt = strlen(t);
-		/* use malloc to avoid memset */
-		u = malloc(ns+nsep+nt+1);
-		if(u == nil)
-			error("can't malloc: %r");
-		memmove(u, s, ns);
-		memmove(u+ns, sep, nsep);
-		memmove(u+ns+nsep, t, nt);
-		u[ns+nsep+nt] = '\0';
-	}
-	free(s);
-	return u;
-}
-
-char*
-egrow(char *s, char *sep, char *t)
-{
-	s = eappend(s, sep, t);
-	free(t);
-	return s;
-}
-
-void
-error(char *fmt, ...)
-{
-	va_list arg;
-	char buf[256];
-	Fmt f;
-
-	fmtfdinit(&f, 2, buf, sizeof buf);
-	fmtprint(&f, "Mail: ");
-	va_start(arg, fmt);
-	fmtvprint(&f, fmt, arg);
-	va_end(arg);
-	fmtprint(&f, "\n");
-	fmtfdflush(&f);
-	exits(fmt);
-}
-
 void
 growbytes(Bytes *b, char *s, long ns)
 {
@@ -112,7 +34,7 @@
 		/* use realloc to avoid memset */
 		b->b = realloc(b->b, b->nalloc);
 		if(b->b == nil)
-			error("growbytes: can't realloc: %r");
+			sysfatal("growbytes: can't realloc: %r");
 	}
 	memmove(b->b+b->n, s, ns);
 	b->n += ns;

  reply	other threads:[~2020-12-31  9:43 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-20  9:29 umbraticus
2020-12-20 22:03 ` cinap_lenrek
2020-12-30  3:47   ` umbraticus
2020-12-31  9:42     ` umbraticus [this message]
2021-01-01  4:42       ` umbraticus
2021-01-01 10:05         ` Steve Simon
2021-01-01 19:26         ` ori
2021-01-20  2:20         ` ori
2021-01-20  2:49           ` Alex Musolino
2021-01-20  3:17           ` umbraticus
2021-01-24  5:46             ` umbraticus
2021-01-24 23:51               ` ori
2021-01-25 18:42                 ` umbraticus
2021-01-27  2:42               ` ori
2021-08-28 20:22               ` Stuart Morrow
2021-08-29  1:52                 ` umbraticus
2021-08-29  2:08                   ` ori

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=6980D1E3953AA95A3C3A4D2DC9967FEA@prosimetrum.com \
    --to=umbraticus@prosimetrum.com \
    --cc=9front@9front.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).