9front - general discussion about 9front
 help / color / mirror / Atom feed
From: umbraticus@prosimetrum.com
To: 9front@9front.org
Subject: Re: [9front] htmlfmt anchor corner cases
Date: Wed, 30 Dec 2020 16:47:40 +1300	[thread overview]
Message-ID: <A7D190FBBAB6CFB698B84019E74DE58A@prosimetrum.com> (raw)
In-Reply-To: <C575CC17B8178CBD8E941B2D6D2F2824@felloff.net>

> i do not like this part so much.
> 
> +		if(href!=nil && href[0]=='/'
> +		&& (base = strchr(result, ':')) != nil
> +		&& (base = strchr(base+3, '/')) != nil)
> +			*base = '\0';
> 
> the issue is htmlfmt's code to combine relative
> urls is just wrong. handling urls can be hard.
> 
> but adding hacks like these does not solve the
> problem sufficiently. theres code in webfs that
> might be of help.
> 
> maybe we should make a version for libhtml,
> which htmlfmt uses.

Yes, each program that uses libhtml does its own thing:

/sys/src/cmd/htmlfmt/html.c:/^fullurl
/sys/src/cmd/abaco/urls.c:/^urlcombine	quote: /* this is a HACK */
/sys/src/cmd/mothra/url.c:/^fileget
/sys/src/cmd/mothra/url.c:/^webclone ← makes use of webfs

Below is a patch that makes urls absolute when possible during parsing.
I'm not sure whether the comment "// FOR NOW: leave the url relative."
indicates that this was intended all along...

This would obviate the functions in htmlfmt and abaco but mothra
doesn't even use parsehtml so...  is it even worth it?  I'll have
another go at tidying up htmlfmt and send a separate email.

umbraticus

diff -r f4a5c13bcd43 sys/src/libhtml/build.c
--- a/sys/src/libhtml/build.c	Mon Dec 28 12:24:47 2020 +0100
+++ b/sys/src/libhtml/build.c	Wed Dec 30 16:12:33 2020 +1300
@@ -309,7 +309,6 @@
 static void			pushfontstyle(Pstate* ps, int sty);
 static void			pushjust(Pstate* ps, int j);
 static Item*		textit(Pstate* ps, Rune* s);
-static Rune*		removeallwhite(Rune* s);
 static void			resetdocinfo(Docinfo* d);
 static void			setcurfont(Pstate* ps);
 static void			setcurjust(Pstate* ps);
@@ -425,6 +424,40 @@
 
 static Item *getitems(ItemSource* is, uchar* data, int datalen);
 
+// Return malloced url, given (possibly empty) path and base.
+// A relative path and absolute base are combined; otherwise, path is returned as is.
+// If path is nil and base absolute, base up to final slash is returned.
+// URL strings are not validated any further than checking for proto://
+Rune*
+_fullurl(Rune *path, Rune *base)
+{
+	Rune *r;
+
+	if(path != nil){
+		for(r = path; isalpha(*r); r++)
+			;
+		if(r > path && *r++ == ':' && *r++ == '/' && *r == '/')
+			return _Strdup(path);	/* path is already absolute */
+	}
+	if(base == nil)
+		return _Strdup(path);
+	for(r = base; isalpha(*r); r++)
+		;
+	if(r == base || *r++ != ':' || *r++ != '/' || *r++ != '/')
+		return _Strdup(path);	/* bad base url proto */
+	while(isalnum(*r) || *r == '_' || *r == '@' || *r == '-' || *r == ':' || *r == '.')
+		r++;
+	if(r[-1] == '/' || *r && *r != '/')
+		return _Strdup(path);	/* bad base url hostname */
+	if(*r == '/' && (path == nil || *path != '/'))
+		r = runestrrchr(r, '/');	/* find final slash if path is not rooted */
+	if(path == nil)
+		return runesmprint("%.*S/", (int)(r - base), base);
+	if(*path == '/')
+		path++;
+	return runesmprint("%.*S/%S", (int)(r - base), base, path);
+}
+
 // Parse an html document and create a list of layout items.
 // Allocate and return document info in *pdi.
 // When caller is done with the items, it should call
@@ -439,7 +472,7 @@
 
 	di = newdocinfo();
 	di->src = _Strdup(pagesrc);
-	di->base = _Strdup(pagesrc);
+	di->base = _fullurl(nil, pagesrc);
 	di->mediatype = mtype;
 	di->chset = chset;
 	*pdi = di;
@@ -2923,55 +2956,18 @@
 	return ans;
 }
 
-// Attribute value when value is a URL, possibly relative to base.
-// FOR NOW: leave the url relative.
+// Attribute value when value is a URL.
+// Relative URLs are converted to absolute if a suitable base is given.
 // Caller must free the result (eventually).
 static Rune*
 aurlval(Token* tok, int attid, Rune* dflt, Rune* base)
 {
 	Rune*	ans;
-	Rune*	url;
-
-	USED(base);
-	ans = nil;
-	if(_tokaval(tok, attid, &url, 0) && url != nil)
-		ans = removeallwhite(url);
+
+	_tokaval(tok, attid, &ans, 0);
 	if(ans == nil)
-		ans = _Strdup(dflt);
-	return ans;
-}
-
-// Return copy of s but with all whitespace (even internal) removed.
-// This fixes some buggy URL specification strings.
-static Rune*
-removeallwhite(Rune* s)
-{
-	int	j;
-	int	n;
-	int	i;
-	int	c;
-	Rune*	ans;
-
-	j = 0;
-	n = _Strlen(s);
-	for(i = 0; i < n; i++) {
-		c = s[i];
-		if(c >= 256 || !isspace(c))
-			j++;
-	}
-	if(j < n) {
-		ans = _newstr(j);
-		j = 0;
-		for(i = 0; i < n; i++) {
-			c = s[i];
-			if(c >= 256 || !isspace(c))
-				ans[j++] = c;
-		}
-		ans[j] = 0;
-	}
-	else
-		ans = _Strdup(s);
-	return ans;
+		return _Strdup(dflt);
+	return _fullurl(ans, base);
 }
 
 // Attribute value when mere presence of attr implies value of 1,

  reply	other threads:[~2020-12-30  3:53 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-20  9:29 umbraticus
2020-12-20 22:03 ` cinap_lenrek
2020-12-30  3:47   ` umbraticus [this message]
2020-12-31  9:42     ` umbraticus
2021-01-01  4:42       ` umbraticus
2021-01-01 10:05         ` Steve Simon
2021-01-01 19:26         ` ori
2021-01-20  2:20         ` ori
2021-01-20  2:49           ` Alex Musolino
2021-01-20  3:17           ` umbraticus
2021-01-24  5:46             ` umbraticus
2021-01-24 23:51               ` ori
2021-01-25 18:42                 ` umbraticus
2021-01-27  2:42               ` ori
2021-08-28 20:22               ` Stuart Morrow
2021-08-29  1:52                 ` umbraticus
2021-08-29  2:08                   ` ori

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=A7D190FBBAB6CFB698B84019E74DE58A@prosimetrum.com \
    --to=umbraticus@prosimetrum.com \
    --cc=9front@9front.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).