9fans - fans of the OS Plan 9 from Bell Labs
 help / color / mirror / Atom feed
From: "matt" <matt@proweb.co.uk>
To: <9fans@cse.psu.edu>
Subject: [9fans] Google File System
Date: Fri, 15 Nov 2002 23:28:28 +0000	[thread overview]
Message-ID: <027001c28cfe$b509b7d0$6501a8c0@KIKE> (raw)

[-- Attachment #1: Type: text/plain, Size: 2066 bytes --]

Money and mouth in [im]perfect harmony

attached and http://proweb.net/~matt/p9-4/goofs.c

[I had a netiquette conundrum here: when does a post get too big to
attach?]

Here's the fs I mentioned the other day.

It started as a look into the Google API which uses SOAP
http://www.google.com/apis/

My eyes started to bleed reading that documentation so I wrote one that
parses the HTML returned from a GET

I've not used it that much but it did save my life already. We lost some
data from the database [don't ask] and I used goofs to retrieve the relevant
pages from google's cache of my site and was able to re-create the data.

I'm pretty sure it does stuff wrong but it seems to work.

to do a search echo the term into the ctl file
you can even add extra options such as site restriction
(you can get quite complicated if you try)

It will break when Google changes its HTML output


example
% goofs -m /usr/matt/gofs
% echo 'factotum site:www.cs.bell-labs.com' > /usr/matt/gofs/ctl

% ls /usr/matt/gofs/factotum+site:www.cs.bell-labs.com
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/2
..
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/19  [goofs tries to get 50
results]

% ls '/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1'

/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/cached
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/description
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/folder
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/related
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/summary
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/title
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/url

% cat /usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/summary

<b>Factotum</b> and SecStore</a><br><font size=-1> <b>...</b> A process
called <b>factotum</b> is used to hold credentials like passwords<br>
and public/private keypairs and perform cryptographic operations. <b>...</b>
<br>



[-- Attachment #2: goofs.c --]
[-- Type: application/octet-stream, Size: 13305 bytes --]

/*  

8c goofs.c && 8l goofs.8 && mv 8.out goofs

This is my first user level file server.
It does a google search and presents files in the mountpoint 
based on the html returned by google
to do a search echo the term into the ctl file
you can even add extra options such as site restriction
(you can get quite complicated if you try)

It will break when Google changes its HTML output


example
% goofs -m /usr/matt/gofs
% echo 'factotum site:www.cs.bell-labs.com' > /usr/matt/gofs/ctl 

% ls /usr/matt/gofs/factotum+site:www.cs.bell-labs.com
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/2
..
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/19  [goofs tries to get 50 results]

% ls '/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1' 

/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/cached
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/description
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/folder
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/related
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/summary
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/title
/usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/url

% cat /usr/matt/gofs/factotum+site:www.cs.bell-labs.com/1/summary

<b>Factotum</b> and SecStore</a><br><font size=-1> <b>...</b> A process called <b>factotum</b> is used to hold credentials like passwords<br>
and public/private keypairs and perform cryptographic operations. <b>...</b> 
<br>


*/

#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ndb.h>
#include <fcall.h>
#include <thread.h>
#include <9p.h>


/* File tree served by this program; allocated in main() with alloctree(). */
Tree *tree=nil;
/* The "ctl" file created in main(); it is tagged id_Ctl, so writes to it start a search. */
File *ctrlfile=nil;

enum {
	id_Ctl=1,
	id_Qr=2,
};

/*
 * Per-file state.  Entries live on the singly linked list aux_list and are
 * found by index; a File's aux field holds that index (cast to a pointer),
 * not a pointer to the Aux itself.
 */
typedef struct Aux Aux;
struct Aux {
	int id_code;	/* id_Ctl, id_Qr, or 0 when no kind was given */
	int index;	/* lookup key assigned by get_aux(0); starts at 1 */
	Aux *next;	/* next (older) entry on aux_list */
	char *data;	/* NUL-terminated copy of the file contents, or nil */
	int datasize;	/* bytes in data, counting the terminating NUL; 0 when data is nil */
};

typedef struct QResult QResult;
struct QResult {
	char *search;
	char *url;
	char *title;
	char *summary;
	char *cached;
	char *description;
	char *related;
	char *folder;
};

/*
 * Replace every byte equal to c_before with c_after in the first count
 * bytes of subject.  The scan covers exactly count bytes and does not stop
 * at a NUL.
 *
 * Returns the number of bytes replaced; 0 when subject is a null pointer
 * or count is not positive.
 */
int strn_tr(char *subject, int count, char c_before, char c_after) {
	int replaced = 0;
	char *p, *end;

	if (!subject || count <= 0)
		return 0;

	end = subject + count;
	for (p = subject; p < end; p++) {
		if (*p != c_before)
			continue;
		*p = c_after;
		replaced++;
	}
	return replaced;
}

/*
 * Replace every c_before with c_after throughout the NUL-terminated string
 * subject.  Returns the number of replacements, or 0 for a null subject.
 */
int str_tr(char *subject, char c_before, char c_after) {
	return subject ? strn_tr(subject, strlen(subject), c_before, c_after) : 0;
}


/* Head of the list of all Aux entries; get_aux(0) pushes new entries at the front. */
Aux *aux_list=nil;
/* Forward declarations for helpers that are used before they are defined. */
File * create_new_file(File *d, char* filename, int perm, int id_code, char *data, int datasize) ;
char * set_file_data(File *f, char *data, int datasize);
Aux * get_aux(int);

/*
 * Create a file called filename in directory dir, unless the name is
 * already taken.
 *
 * For a new file, a fresh Aux entry is obtained with get_aux(0) and tagged
 * with id_code; its index is passed to createfile() as the file's aux
 * value, and set_file_data() then copies data (datasize bytes) into it.
 *
 * Returns whatever createfile() returned, or nil when dir or filename is
 * nil or when walkfile() finds an existing file of that name.
 *
 * NOTE(review): on the "already exists" path the File returned by
 * walkfile() is dropped without being released, and the incref/decref
 * calls on dir differ between the two branches -- confirm against the
 * reference-counting rules of walkfile()/createfile() in lib9p.
 */
File *
create_new_file(File *dir, char* filename, int perm, int id_code, char *data, int datasize) {
	File *new_file=nil;
	Aux *new_aux;

	if (dir == nil || filename == nil) return nil;

	incref(dir);	/* so walk doesn't kill it immediately on failure */

	if (new_file = walkfile(dir, filename)) {
		/* name already present: report failure to the caller */
		new_file = nil;
	} else {
		new_aux = get_aux(0);
		new_aux->id_code = id_code;
		/* the File carries the Aux index, not a pointer to the Aux */
		new_file = createfile(dir, filename, getuser(), perm, (void*)new_aux->index);
		set_file_data(new_file, data, datasize);
		incref(dir);
	}

	decref(dir);
	
	return new_file;
}

/*
 * Return the file called filename in directory dir, creating it first if
 * walkfile() does not find it.
 *
 * Creation works as in create_new_file(): a fresh Aux entry tagged with
 * id_code backs the new file and receives a copy of data.  The perm,
 * id_code, data and datasize arguments are ignored when the file already
 * exists.
 *
 * Returns the existing or newly created File, or nil when dir or filename
 * is nil (or, for a new file, whatever createfile() returned).
 *
 * NOTE(review): dir is incref'd twice and decref'd once here; whether that
 * balances depends on how walkfile()/createfile() treat the directory
 * reference -- confirm against lib9p.
 */
File *
get_or_create_new_file(File *dir, char* filename, int perm, int id_code, char *data, int datasize) {
	File *new_file=nil;
	Aux *new_aux;

	if (dir == nil || filename == nil) return nil;
	incref(dir);
	incref(dir);	/* so walk doesn't kill it immediately on failure */

	if (! (new_file = walkfile(dir, filename)) ) {
		new_aux = get_aux(0);
		new_aux->id_code = id_code;
		/* the File carries the Aux index, not a pointer to the Aux */
		new_file = createfile(dir, filename, getuser(), perm, (void*)new_aux->index);
		set_file_data(new_file, data, datasize);
		
	}

	decref(dir);
	
	return new_file;
}

/*
 * Look up or allocate an Aux entry.
 *
 * index != 0: return the entry on aux_list with that index, or nil if
 *             there is none.
 * index == 0: allocate an empty entry, give it the next index (one more
 *             than the current head's, or 1 for the first entry), push it
 *             on the front of aux_list and return it.
 */
Aux *
get_aux(int index) {
	Aux *node;

	if (index) {
		node = aux_list;
		while (node && node->index != index)
			node = node->next;
		return node;
	}

	node = emalloc9p(sizeof(Aux));
	node->id_code = 0;
	node->data = nil;
	node->datasize = 0;
	node->index = aux_list ? aux_list->index + 1 : 1;
	node->next = aux_list;	/* nil when this is the first entry */
	aux_list = node;

	return node;
}

/*
 * Release an Aux entry: its data buffer and the entry itself.
 *
 * The entry is not unlinked from aux_list; the caller must have saved
 * a->next (as fsend() does) and must not touch a afterwards.
 * A nil argument is ignored.
 */
void
free_aux(Aux *a) {
	if (!a) return;
	free(a->data);
	free(a);	/* previously only the data was freed, leaking every node */
}

/*
 * Replace the contents held by a with a private copy of data.
 *
 * datasize is the number of bytes to copy from data; the stored copy gets
 * one extra terminating NUL byte, and that byte IS counted in a->datasize
 * and in the return value.  Passing data == nil (or a negative datasize)
 * just clears the entry.
 *
 * Returns the new a->datasize: datasize + 1 after a copy, 0 when the entry
 * was cleared or a is nil.
 */
int
set_aux_data(Aux *a, char * data, int datasize) {

	/* the original checked a only for the free() and then dereferenced it */
	if (a == nil)
		return 0;

	free(a->data);
	a->data = nil;
	a->datasize = 0;

	if (data && datasize >= 0) {
		a->data = emalloc9p(datasize + 1);
		memcpy(a->data, data, datasize);
		a->data[datasize] = 0;
		a->datasize = datasize + 1;
	}

	return a->datasize;
}

/*
 * Store a copy of data (datasize bytes) as the contents of file f.
 *
 * The Aux entry is found through the index kept in f->aux; when that index
 * is 0, get_aux() allocates a fresh entry.  f->length is set to the value
 * returned by set_aux_data() and f->aux is rewritten with the entry's
 * index.
 *
 * Returns the stored copy, or nil when f is nil or no entry was found.
 */
char *
set_file_data(File *f, char *data, int datasize) {
	Aux *entry;

	if (!f)
		return nil;

	entry = get_aux((int)f->aux);
	if (!entry)
		return nil;

	f->length = set_aux_data(entry, data, datasize);
	f->aux = (void*)entry->index;
	return entry->data;
}

/*
 * Open handler: every open request is answered with success.
 * (The unused lookup of the file's aux index has been removed.)
 */
void
fsopen(Req *r) { 
	respond(r, nil);
}

/*
 * Append source to the heap-allocated string target.
 *
 * target is grown with erealloc9p() and may move, so callers must use the
 * returned pointer.  A nil target yields a fresh copy of source; a nil
 * source leaves target untouched.  source itself is never freed.
 */
char *
str_append(char *target, char *source) {
	if (!source)
		return target;
	if (!target)
		return estrdup9p(source);

	target = erealloc9p(target, strlen(target) + strlen(source) + 1);
	return strcat(target, source);
}

/*
 * Allocate a QResult for the given search term.
 *
 * search is copied; every other field starts as an empty, heap-allocated
 * string so that later code can call strlen()/strchr()/str_append() on it
 * without a nil check.  The empty strings come from estrdup9p() rather
 * than an unchecked mallocz(), so no field is left nil here.
 *
 * Release the result with free_qresult().
 */
QResult *
new_query_result(char *search) {
	QResult *qr = emalloc9p(sizeof(QResult));
	qr->search = estrdup9p(search);
	qr->url = estrdup9p("");
	qr->summary = estrdup9p("");
	qr->cached = estrdup9p("");
	qr->description = estrdup9p("");
	qr->related = estrdup9p("");
	qr->folder = estrdup9p("");
	qr->title = estrdup9p("");
	return qr;
}
		
/*
 * Scrape one search hit from bio_in into qr.
 *
 * The input is consumed up to and including the first '=', then the text
 * up to the next '>' (delimiter dropped) is appended to qr->url.  If the
 * url contains a '/', qr->related becomes "q=related:" followed by
 * whatever comes after the first '/'.
 *
 * After that the input is read in chunks ending at the current delimiter
 * (token).  The tail of each chunk is compared with a set of markers that
 * switch the current section (block):
 *	's' summary, 'd' description, 'f' folder, 'c' cached, '>' skip.
 * Chunks that match no marker are appended to the field of the current
 * section unless end_block is set.  A chunk ending in "<!--n-->" ends the
 * hit; the loop also ends when Brdstr() returns nil.
 *
 * Returns qr, or nil when qr is nil.
 *
 * Compared with the original, only the setup before the loop changed: the
 * string returned by Brdstr() for the url is now freed after it has been
 * appended, and the previous qr->related is freed before it is replaced.
 */
QResult *
fill_query_result(QResult *qr, Biobuf *bio_in) {
	char *start_c=nil;
	int siz;
	char block;
	int end_block=0;
	int chop=0;
	char token;
	char *s;
	char *piece;

	if(!qr) return nil;

	start_c = Brdstr(bio_in, '=', 0);
	if(start_c) {
		free(start_c);  // just consume it
		start_c = nil;
	}

	chop=1;

	/* str_append() copies its source, so the Brdstr() buffer is ours to free */
	piece = Brdstr(bio_in, '>', chop);
	qr->url = str_append(qr->url, piece);
	free(piece);

	s = nil;
	if(qr->url) s = strchr(qr->url, '/');
	if(s) {
		piece = smprint("q=related:%s", s + 1);
		if(piece) {
			free(qr->related);	/* drop the placeholder from new_query_result() */
			qr->related = piece;
		}
	}
	token = '>';
	block = 's';
	chop =0;
	while(token && (start_c=Brdstr(bio_in, token, chop))) {
		siz = Blinelen(bio_in) ;

		if(siz > 28) {
			if ((block == 'f') && strcmp(start_c + (siz - 29), "</span><a class=fl href=http:") == 0) {
				token = '/';
				chop = 0;
				end_block = 0;
				goto next_block;
			}
		}
		if (siz > 19) {
			if (strcmp(start_c + (siz-20), "<font color=#008000>") == 0) {
				/* cut the marker off and keep what preceded it */
				start_c[siz-20] = 0;
				switch(block) {
				case 's' :
					qr->summary = str_append(qr->summary, start_c);
					break;
				case 'd' :
					qr->description = str_append(qr->description, start_c);
					break;
				case 'f' :
					qr->folder = str_append(qr->folder, start_c);
					break;
				}
				end_block = 1;
				block = '>';
				token = ':';
				chop=0;
				goto next_block;
			}
		}

		if (siz > 15) {
			if (strcmp(start_c + (siz-16), "/search?q=cache:") == 0) {
				qr->cached = str_append(qr->cached, "http:");
				qr->cached = str_append(qr->cached, start_c);
				block = 'c';
				token = '>';
				chop=1;
				
				end_block=0;
				goto next_block;
			}
		} 
		if (siz > 13) {
			if (strcmp(start_c + (siz-14), "<span class=f>") == 0) {
				end_block=0;
				token = ':';
				chop=0;
				block = '>';
				goto next_block;
			}
		}

		if (siz > 11) {
			if (strcmp(start_c + (siz-12), "Description:") == 0) {
				block ='d';
				end_block=0;
				token = '>';
				chop=0;
				goto next_block;
			}
		}
		if (siz > 8) {
			if (strcmp(start_c + (siz-9), "Category:") == 0) {
				block ='f';
				end_block=0;
				token = ':';
				chop=0;
				goto next_block;
			}
		}
		if (siz > 7) {
			if (strcmp("<!--n-->", start_c +(siz-8)) == 0) {
				/* end of this hit: clearing token stops the loop */
				token = 0;
				goto next_block; 
			}
		}

		/* bare tags that are dropped rather than stored */
		if (strcmp("</span>", start_c) == 0)
				if (block == 'd' ) goto next_block;
		if (strcmp(" </span>", start_c) == 0)
				if (block == 'f' ) goto next_block;
		if (strcmp("</font>", start_c) == 0)
				if (block == 'd') goto next_block;
		if(strcmp("<span class=f>", start_c) == 0)
				if (block == 'd') goto next_block;
		if(strcmp("/", start_c) == 0)
				if (block =='f') {
					token = '>';
					chop = 1;
					goto next_block;
				}

		if(!end_block) {
			switch (block) {
			case 's' :
				qr->summary = str_append(qr->summary, start_c);
				break;
			case 'c' :
				qr->cached = str_append(qr->cached, start_c);
				end_block=1;
				chop =0;
				break;
			case 'd':
				qr->description = str_append(qr->description, start_c);
				break;
			case 'f':
				qr->folder = str_append(qr->folder, start_c);
				end_block = 1;
				token = '>';
				chop = 0;
				break;
			case '>' :
				// skip
				break;
			}
			goto next_block;
		}
	next_block :
		free(start_c);
		start_c = nil;
	}

	free(start_c);

	return qr;
}	
	

/*
 * Placeholder for URL escaping: no encoding is performed and the argument
 * is handed back unchanged (same pointer, nothing allocated).
 */
char*
url_encode(char *string) {
	char *encoded = string;

	return encoded;
}
/*
 * Release a QResult made by new_query_result(): every string field and
 * then the structure itself (the original freed only the fields, leaking
 * one QResult per search hit).  qr must not be used afterwards.
 * A nil argument is ignored.
 */
void
free_qresult(QResult *qr) {
	if (qr == nil) return;

	free(qr->url);
	free(qr->summary);
	free(qr->search);
	free(qr->cached);
	free(qr->description);
	free(qr->related);
	free(qr->folder);
	free(qr->title);

	free(qr);
}

/* Create the read-only result file name in dir holding text; a nil text gives an empty file. */
static void
create_text_file(File *dir, char *name, char *text) {
	create_new_file(dir, name, 0444, id_Qr, text, text ? strlen(text) : 0);
}

/*
 * Publish one search hit as a directory under qr_root.
 *
 * The directory is named after qnum in decimal and receives one read-only
 * file per QResult field: url, title, summary, cached, description,
 * related and folder.  If a directory of that name already exists,
 * create_new_file() returns nil and nothing is added.
 *
 * Always returns nil; the return type is kept for existing callers.
 * (The original used a bare "return;" in the guard below, which leaves the
 * return value undefined.)
 */
File *
create_fs(File *qr_root, QResult *qr, int qnum) {
	File *qr_dir;
	char *qnumtxt;

	if(qr_root == nil || qr == nil) return nil;

	qnumtxt = smprint("%d", qnum);
	if (qr_dir=create_new_file(qr_root, qnumtxt, DMDIR|0777, 0, nil, 0)) {
		create_text_file(qr_dir, "url", qr->url);
		create_text_file(qr_dir, "title", qr->title);
		create_text_file(qr_dir, "summary", qr->summary);
		create_text_file(qr_dir, "cached", qr->cached);
		create_text_file(qr_dir, "description", qr->description);
		create_text_file(qr_dir, "related", qr->related);
		create_text_file(qr_dir, "folder", qr->folder);
	}
	free(qnumtxt);
	return nil;
}

/*
 * Run one search and publish the hits in the file tree.
 *
 * search_item is the query with blanks already replaced by '+'; it is also
 * used as the name of the directory under the tree root that collects the
 * hits.  start_q and num_q are sent as the "start" and "num" request
 * parameters, and hits are numbered from start_q + 1.
 *
 * The reply is read in chunks ending at '>'.  A chunk ending in "<!--m-->"
 * introduces a hit, which fill_query_result() parses and create_fs()
 * publishes; a chunk ending in "<!--z-->" ends the page.
 *
 * Always returns nil (no error string is produced), including when the
 * connection or the directory cannot be set up.
 *
 * Changes from the original:
 *  - the request is written with a "%s" format, so '%' in the search term
 *    is no longer interpreted by fprint();
 *  - the User-Agent uses '/' where the literal had the octal escapes
 *    "\5" and "\200";
 *  - any non-negative descriptor from dial() is accepted;
 *  - the connection and the Biobuf are released on every path;
 *  - the stray ';' after the fill_query_result() test is gone.
 *
 * NOTE(review): the host dialled (www.google.co.uk) differs from the host
 * named in the request line (www.google.com), and header lines end in
 * "\n" rather than "\r\n" -- left as found.
 */
char *
process_query(char *search_item, int start_q, int num_q) {
	int gfd;
	int not_finished;
	Biobuf *gbio_in;
	char *html = nil;
	char *post=nil;
	int siz;
	QResult *qr;
	File *qr_root;

	if (!search_item) {
		return nil;
	}

	gfd = dial("tcp!www.google.co.uk!80", 0, 0, 0);
	if (gfd < 0)
		return nil;

	post = smprint("GET http://www.google.com/search?q=%s&sourceid=mozilla-search&start=%d&num=%d&as_oq=&as_eq=&lr=&as_ft=i&as_filetype=&as_qdr=all&as_occt=any&as_dt=i&safe=images HTTP/1.0\nUser-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.0.0) Gecko/20020530\n\n", url_encode(search_item), start_q, num_q);
	if (!post) {
		close(gfd);
		return nil;
	}
	fprint(gfd, "%s", post);
	free(post);

	qr_root = get_or_create_new_file(tree->root, search_item, DMDIR|0777, 0, nil, 0);
	if (!qr_root) {
		close(gfd);
		return nil;
	}

	gbio_in = emalloc9p(sizeof(Biobuf));
	Binit(gbio_in, gfd, OREAD);

	not_finished = 1;
	while(not_finished && (html = Brdstr(gbio_in, '>', 0))) {
		siz = Blinelen(gbio_in);

		if (siz > 7) {
			if ( strcmp("<!--m-->", html + siz-8) == 0) {
				qr = new_query_result(search_item) ;
				if (fill_query_result(qr, gbio_in))
					create_fs(qr_root, qr, ++start_q);
				free_qresult(qr);
			} else {
				/* strcmp() yields 0, i.e. "finished", on the end marker */
				not_finished = strcmp("<!--z-->", html + siz-8);
			}
		}

		free(html);
	}

	/* the Biobuf was set up with Binit() on our own buffer and descriptor */
	free(gbio_in);
	close(gfd);
	return nil;
}

/*
 * Turn the text written to the ctl file into a search.
 *
 * Works on a copy of a->data: newlines become blanks, leading and trailing
 * blanks are dropped, and the remaining blanks become '+'.  The result is
 * passed to process_query() asking for 50 hits starting at 0.
 *
 * Returns what process_query() returned, or nil when a or a->data is nil
 * or the message holds nothing but blanks and newlines.
 *
 * The trailing-blank trim is now bounded by the start of the text; the
 * original stepped back from query + strlen(query) - 1 with no lower
 * bound, reading and writing before the buffer for an empty or all-blank
 * message.
 */
char *
process_ctl_message(Aux *a) {
	char *reply = nil;
	char *query=nil;
	char *lastchar;
	char *firstchar;

	if(!(a && a->data)) return nil;

	query = estrdup9p(a->data);
	str_tr(query, '\n', ' ');

	for(firstchar=query; *firstchar== ' '; firstchar++)
		;

	/* lastchar points one past the last kept character */
	lastchar = firstchar + strlen(firstchar);
	while(lastchar > firstchar && lastchar[-1] == ' ')
		*--lastchar = 0;

	if(*firstchar) {
		str_tr(firstchar, ' ', '+');
		reply = process_query(firstchar, 0, 50);
	}
	free(query);

	return reply;
}

/*
 * Write handler.
 *
 * The written bytes replace the file's whole contents (the request offset
 * is not used).  If the file is the ctl file (id_Ctl), the new contents
 * are then run as a search by process_ctl_message().
 *
 * The reply now reports the number of bytes accepted in ofcall.count; the
 * original stored a value in ofcall.offset instead, leaving the count at
 * whatever it was initialised to.  When the file has no Aux entry, or
 * nothing was stored, the count is left untouched as before.
 */
void
fswrite(Req *r) {
	int index= (int)r->fid->file->aux;
	Aux *a;
	char * errstr = nil;
	
	if(index) {
		a = get_aux(index);
		if (a && set_aux_data(a, r->ifcall.data, r->ifcall.count) ) {
			r->ofcall.count = r->ifcall.count;
			if (a->id_code == id_Ctl)
				errstr = process_ctl_message(a);
		}
	}
	respond(r, errstr);
	free(errstr);
}


/*
 * Read handler: answer from the file's Aux data.
 *
 * The reply points straight into a->data starting at the requested offset
 * and is limited both by the data available (a->datasize, which counts
 * the terminating NUL) and by the number of bytes the client asked for.
 * A file without data, or an offset outside the data, gives an empty
 * reply.
 *
 * Changes from the original: the reply is clamped to ifcall.count (it
 * used to return everything from offset to the end), negative offsets are
 * rejected, and an aux index of 0 no longer makes get_aux() allocate a
 * new entry on every read.
 */
void
fsread(Req *r)
{
	int index;
	long avail;
	Aux *a=nil;

	index= (int)r->fid->file->aux;
	if (index)
		a = get_aux(index);

	if (a && a->data && r->ifcall.offset >= 0 && r->ifcall.offset < a->datasize) {
		avail = a->datasize - r->ifcall.offset;
		if (avail > r->ifcall.count)
			avail = r->ifcall.count;
		r->ofcall.data = a->data + r->ifcall.offset;
		r->ofcall.count = avail;
	} else {
		r->ofcall.data = nil;
		r->ofcall.count = 0;
	}
	respond(r, nil);
}

/*
 * End-of-service hook: hand every entry on aux_list to free_aux(),
 * leaving aux_list nil.  The successor is read before each entry is
 * released.
 */
void
fsend (Srv *) {
	Aux *following;

	for (; aux_list; aux_list = following) {
		following = aux_list->next;
		free_aux(aux_list);
	}
}

Srv fs = 
{
.open=		fsopen,






/* Print the command synopsis on file descriptor 2 and exit with status "usage". */
static void
usage(void)
{
	fprint(2, "usage: goofs [-m mtpt] \n");
	exits("usage");
}
	


/*
 * Entry point: goofs [-m mtpt]
 *
 * Builds a tree whose root holds a single writable "ctl" file and mounts
 * it on mtpt (default /mnt/goofs) with postmountsrv().
 *
 * Changes from the original: -m without an argument and unknown options
 * now print the usage message instead of being accepted silently, failure
 * to create the ctl file ends the program, and the unused local was
 * removed.
 */
void
main(int argc, char **argv)
{
	char *mtpt;
	
	mtpt = "/mnt/goofs";

	ARGBEGIN{
	case 'm':
		mtpt = ARGF();
		if(mtpt == nil)
			usage();
		break;
	default:
		usage();
	}ARGEND;
	if(argc != 0)
		usage();
	
	tree = fs.tree = alloctree(getuser(), getuser(), DMDIR|0555, nil);
	ctrlfile = create_new_file(tree->root, "ctl", 0666, id_Ctl, nil, 0) ;
	if(ctrlfile == nil)
		exits("cannot create ctl file");

	postmountsrv(&fs, nil, mtpt, MREPL);

	exits(nil);

}


             reply	other threads:[~2002-11-15 23:28 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-11-15 23:28 matt [this message]
2002-11-16 17:11 mark powers
2002-11-16 20:10 markp

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='027001c28cfe$b509b7d0$6501a8c0@KIKE' \
    --to=matt@proweb.co.uk \
    --cc=9fans@cse.psu.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).