Automating the Web with WebL
by Hannes Marais and Tom Rodeheffer


Example 1:
(a)
// simplest case: fetch a single page and assign it to page
page = GetURL("http://www.digital.com")
(b)
// fetch with request parameters: the [. ... .] object carries them as named fields
page = GetURL(
     "http://www.altavista.digital.com/cgi-bin/query",
     [. pg="q", what="web", q="java" .])
(c)
// "?" service combinator: the second GetURL is executed only if the first one fails
page = GetURL("http://www.altavista.digital.com/") ?
     GetURL("http://www.altavista.yellowpages.com.au")
(d)
// "|" service combinator.
// NOTE(review): presumably both fetches are started concurrently and the first
// one to succeed supplies the page - confirm against the WebL manual
page = GetURL("http://www.altavista.digital.com/") |
     GetURL("http://www.altavista.yellowpages.com.au")
(e)
// same "|" combination of two fetches, wrapped in Timeout.
// NOTE(review): 10000 is presumably a limit in milliseconds (10 s) after which
// the whole expression fails - confirm units against the WebL manual
page = Timeout(10000,
     GetURL("http://www.altavista.digital.com/") |
     GetURL("http://www.altavista.yellowpages.com.au"))
(f)
// the "?" fallback pair, wrapped in Retry.
// NOTE(review): Retry presumably re-executes its argument until it succeeds,
// so this keeps alternating between the two servers - confirm
page = Retry(
     GetURL("http://www.altavista.digital.com/") ?
     GetURL("http://www.altavista.yellowpages.com.au"))
(g)
// if the fetch fails, the "?" alternative Timeout(10000,Stall()) is executed.
// NOTE(review): Stall() never produces a page, so the alternative presumably just
// waits out the 10000 limit and then fails, making Retry try the fetch again
// after a pause - confirm against the WebL manual
page = Retry(
     GetURL("http://www.digital.com") ?
     Timeout(10000,Stall()))


Example 2:
(a)
// h1 elements combined with h2 elements (assumes + on piece-sets is set union - confirm)
Elem(page,"h1") + Elem(page,"h2")
(b)
// pattern matches related by "before" to the h1 element at index 3
// (indices start at 0, so that is the fourth h1);
// presumably: matches located earlier in the page than that h1 - confirm
Pat(page,"(W|w)eb") before Elem(page,"h1")[3]
(c)
// anchors related by "directlyafter" to a match of the CITE pattern;
// presumably: for each match, only the nearest following anchor - confirm
// NOTE(review): \b looks intended as a regex word boundary - verify how WebL
// string literals treat the backslash
Elem(page,"a") directlyafter Pat(page,"\bCITE\b")
(d)
// anchors that contain an img element (e.g. linked images)
Elem(page,"a") contain Elem(page,"img")
(e)
// Rows of every "Stock Quotes" table except the first one.
// Built up step by step; each var is a piece-set derived from page.
var tbls = Elem(page,"table");       // all tables
var sqs = Pat(page,"Stock Quotes");  // all "Stock Quotes" texts
var sqts = tbls directlycontain sqs; // innermost tables holding such a text
var sqts1 = sqts after sqts[0];      // except the first
Elem(page,"tr") inside sqts1         // rows of those tables (sqts1 from the line above)

 

Listing One
<!-- Fixture page. It appears to be the demo.html that the extraction script in
     Listing Two fetches: it has three anchors (the first with a name but no href),
     bold/italic pairs, and many occurrences of the search word. Do not reword the
     visible text: the expected output in Listing Three depends on it. -->
<html><head><title>WebL Test Page</title></head>
<body>
<img src="pix/webllogo.gif"><p>
<font size=+2>W<b>e</b>bL</font> (pronounced "webble")
is a <font size=+1>scripting language</font> for automating tasks on the
World-Wide Web.
It is an imperative, interpreted language that has built-in support
for common web protocols like HTTP and FTP, and
popular data types like HTML and XML.
<p>
<a name="features">WebL has two special features.</a>
<p>
<b>Service combinators</b> provide an <i>exception-handling mechanism</i>
that makes computations on the web more reliable.
<p>
A <b>markup algebra</b> provides a way
to <i>extract <b>and</b> manipulate data</i> in web pages.
<p>
These features make it easy to implement tools like web shopping robots,
meta-search engines, HTML analysis and checking routines, and so on.
<p>
WebL's implementation language is Java, and the complete source code is
<font size=+1><a href="http://www.research.digital.com/SRC/WebL/index.html">
freely available</a></font>.
Extensions (in the form of modules) are easy to add by Java programmers.
<p>
<a href=http://www.compaq.com><img src="pix/cpqlogo.gif"></a>
</body>
</html>


Listing Two
1 var page = GetURL("file:demo.html");   // fetch the demo page
2 // the script has three parts: anchors, text patterns, element/text sequences
3 
4 // ---- part 1: anchor elements ----
5 var links = Elem(page,"a");         // extract anchor elements
6 
7 // loop for each piece, same order as in document
8 
9 every link in links do
10     PrintLn(link.href) ? nil        // print its href attribute
11 
12     // link.href fails if piece has no href attribute,
13     // service combinator handles failure with a secondary
14     // execution of nil (which does nothing)
15 end;
16 
17 // since piece-sets are ordered, the individual pieces can be
18 // extracted by applying a numeric index
19 
20 PrintLn("name = ",links[0].name);   // name attribute of first anchor
21 
22 // ---- part 2: text pattern occurrences ----
23 
24 var words = Pat(page,"(W|w)eb");    // extract text occurrences
25 
26 // loop for each piece, same order as in document
27 
28 every word in words do
29     Print(Markup(word)," ")         // print word including markup
30 end;
31 
32 PrintLn("= ", Size(words), " times.");    // how many words
33 
34 // ---- part 3: sequences of elements and text ----
35 
36 // extract sequences of <bold element> text <italic element>
37 // each resulting piece is such a sequence
38 // loop for each piece, same order as in document
39 
40 every item in Seq(page,"b # i") do
41 
42     // by applying a numeric index, we get the component that
43     // matched the respective specifier of the search sequence
44 
45     // print the bold element and the italic element,
46     // just text, ignoring markup
47 
48     PrintLn(Text(item[0])," -- ",Text(item[2]))   // item[1], the "#" text in between, is not printed
49 end;


Listing Three
http://www.research.digital.com/SRC/WebL/index.html
http://www.compaq.com/
name = features
Web W<b>e</b>b web Web web Web web web web Web = 10 times.
Service combinators -- exception-handling mechanism
markup algebra -- extract and manipulate data

Listing Four
1 import Str;
2 
3 var QueryAltaVista = fun(query)   // query AltaVista; yields a list of objects with title, href, abstract
4   var results = [];               // initially empty result list
5   var page = GetURL(              // how to query AltaVista
6     "http://www.altavista.digital.com/cgi-bin/query",
7     [. pg="q", q=query .]);       // request parameters; q carries the caller's query
8   every i in Seq(page,"b a br # font br #") do  // find the answers
9     results = results +          // concatenate this answer
10       [[.                        // make an object with these fields
11         title = Text(i[1]),              // i[1] = the "a" component: its text
12         href = i[1].href,                // ... and its href attribute
13         abstract = Str_Trim(Text(i[6]))  // i[6] = the last "#" component; Str_Trim presumably strips surrounding whitespace
14       .]]
15   end;
16   results                        // last expression: the collected list is the function's value
17 end;
18 
19 var QueryHotBot = fun(query)    // query HotBot; yields a list of objects with title, href, abstract
20   var results = [];              // initially empty result list
21   var page = GetURL(             // how to query HotBot
22     "http://www.hotbot.com/default.asp",
23     [. MT=query .]);             // MT is the only request parameter sent
24   every i in Seq(page,"b br # br") do     // find the answers
25     // Elem can search inside a piece too
26     // URL is the 2nd anchor in the 1st component 
27     var a = Elem(i[0],"a")[1];  
28     results = results +          // concatenate this answer
29       [[.                        // make an object with these fields
30         title = Text(a),         // text of that anchor
31         href = a.href,           // ... and its href attribute
32         abstract = Text(i[2])    // i[2] = the "#" component; NOTE(review): not trimmed, unlike QueryAltaVista - intentional?
33       .]]
34   end;
35   results                        // last expression: the collected list is the function's value
36 end;
37 
38 var q = ARGS[1];                 // get query word from command line
39 var results = QueryAltaVista(q) + QueryHotBot(q);   // AltaVista's answers followed by HotBot's; NOTE(review): no "?" fallback, so a failure in either query presumably fails the script
40 PrintLn("Results for ",q,":");
41 every r in results do            // print each result
42   PrintLn();                     // no arguments: presumably an empty separator line
43   PrintLn("    ",r.title);       // title and href are indented four spaces
44   PrintLn("    ",r.href);
45   PrintLn(r.abstract);           // abstract is printed without indent
46 end

Listing Five
1 import Url;
2 import WebServer;
3 
4 var port = 9092;                 // server port, passed to WebServer_Start below
5 var where = "/bin/highlight";    // path the Highlight function is published under
6 
7 
8 var Highlight = fun(req,res)     // request handler: reads req.param, stores the page markup in res.result
9   // access the url and word parameters from the request
10   // missing parameters cause the field access to fail
11   // default values supplied using a service combinator
12 
13   var url = req.param.url ? "http://www.compaq.com";
14   var word = req.param.word ? "Compaq";
15 
16   PrintLn("url=",url," word=",word);   // log it on the console
17 
18   var page = GetURL(url);              // fetch the page
19   // NOTE(review): word comes from the request and is handed to Pat as the search pattern - confirm special characters are acceptable
20   // for each matching text not inside the title 
21   every w in Pat(page,word) !inside Elem(page,"title") do
22     var p = NewNamedPiece("font",w);   // wrap a font element around it
23     p.size := "+1";                    // define its size attribute
24     p.color := "red";                  // define its color attribute
25   end;
26   // links are pointed back at this handler, with the same word and the original target as url
27   every a in Elem(page,"a") do         // for each anchor
28     a.href = where +                   // rewrite its href to be me
29              "?word=" + Url_Encode(word) +    // word parameter
30              "&url=" + Url_Encode(a.href)     // url parameter
31       ? nil;                           // but do nothing if no href
32   end;
33   // NOTE(review): a.href is encoded as found in the page; relative links are presumably not resolved against url - confirm
34   res.result = Markup(page);           // this is the result
35 end;
36 
37 
38 WebServer_Publish(where,Highlight);    // associate url with function
39 WebServer_Start("/dev/null",port);     // disk pages root, server port
40 
41 // when a browser tries to fetch a "published" page,
42 // web server task calls back to our function to handle the request
43 
44 PrintLn("Highlight Server running. Contact :",port,where);   // message ends with ":" + port + path; no host name is printed
45 
46 Stall()                                // server task runs in background


5


