Commit c33c188

Webcrawler sample from AsyncSeq post (#490)
1 parent b392d8c commit c33c188

1 file changed: +75 -1

docsrc/content/type-seqt.fsx

Lines changed: 75 additions & 1 deletion
@@ -86,6 +86,80 @@ let printPages =
 
 printPages |> Async.Start
 
+
 (**
-To make it work with tasks simply add `|> Async.StartAsTask` between `wc.AsyncDownloadString (Uri url)` and `|> SeqT.lift` then run eveything but the `printPages |> Async.Start`.
+The samples above and below come from the [original AsyncSeq post](http://tomasp.net/blog/async-sequences.aspx). They can easily be switched to task sequences (taskSeq): simply add `|> Async.StartAsTask` between `wc.AsyncDownloadString (Uri url)` and `|> SeqT.lift`, then run everything except the `printPages |> Async.Start` line (see the task-based sketch after the diff).
 *)
+
+// A simple webcrawler
+
+#r "nuget: FSharpPlus"
+#r "nuget: HtmlAgilityPack"
+
+open System
+open System.Net
+open System.Text.RegularExpressions
+open HtmlAgilityPack
+open FSharp.Control
+
+open FSharpPlus
+open FSharpPlus.Data
+
+// ----------------------------------------------------------------------------
+// Helper functions for downloading documents, extracting links etc.
+
+/// Asynchronously download the document and parse the HTML
+let downloadDocument url = async {
+    try let wc = new WebClient ()
+        let! html = wc.AsyncDownloadString (Uri url)
+        let doc = new HtmlDocument ()
+        doc.LoadHtml html
+        return Some doc
+    with _ -> return None }
+
+/// Extract all links from the document that start with "https://"
+let extractLinks (doc: HtmlDocument) =
+    try
+        [ for a in doc.DocumentNode.SelectNodes ("//a") do
+            if a.Attributes.Contains "href" then
+                let href = a.Attributes.["href"].Value
+                if href.StartsWith "https://" then
+                    let endl = href.IndexOf '?'
+                    yield if endl > 0 then href.Substring (0, endl) else href ]
+    with _ -> []
+
+/// Extract the <title> of the web page
+let getTitle (doc: HtmlDocument) =
+    let title = doc.DocumentNode.SelectSingleNode "//title"
+    if title <> null then title.InnerText.Trim () else "Untitled"
+
+// ----------------------------------------------------------------------------
+// Basic crawling - crawl web pages and follow just one link from every page
+
+/// Crawl the internet starting from the specified page
+/// From each page follow the first not-yet-visited page
+let rec randomCrawl url =
+    let visited = new System.Collections.Generic.HashSet<_> ()
+
+    // Visits page and then recursively visits all referenced pages
+    let rec loop url = monad.plus {
+        if visited.Add url then
+            let! doc = downloadDocument url |> SeqT.lift
+            match doc with
+            | Some doc ->
+                // Yield url and title as the next element
+                yield url, getTitle doc
+                // For every link, yield all referenced pages too
+                for link in extractLinks doc do
+                    yield! loop link
+            | _ -> () }
+    loop url
+
+// Use SeqT combinators to print the titles of the first 10
+// web sites that are from other domains than en.wikipedia.org
+randomCrawl "https://en.wikipedia.org/wiki/Main_Page"
+|> SeqT.filter (fun (url, title) -> url.Contains "en.wikipedia.org" |> not)
+|> SeqT.map snd
+|> SeqT.take 10
+|> SeqT.iter (printfn "%s")
+|> Async.Start
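
As the note in the diff says, the sample can be switched to task sequences. Here is a minimal sketch of that variant, assuming the helpers from the diff above are in scope; `downloadDocumentTask` and `randomCrawlTask` are hypothetical names used for illustration, not part of the commit:

/// Hypothetical task-based download: the same helper as above, started as a
/// Task so that SeqT.lift produces a task sequence instead of an async one
let downloadDocumentTask url =
    downloadDocument url |> Async.StartAsTask

/// Same crawl loop as in the diff, but lifting from Task instead of Async
let randomCrawlTask url =
    let visited = new System.Collections.Generic.HashSet<_> ()
    let rec loop url = monad.plus {
        if visited.Add url then
            let! doc = downloadDocumentTask url |> SeqT.lift
            match doc with
            | Some doc ->
                yield url, getTitle doc
                for link in extractLinks doc do
                    yield! loop link
            | _ -> () }
    loop url

// A Task-based pipeline is hot, so there is no trailing Async.Start here
randomCrawlTask "https://en.wikipedia.org/wiki/Main_Page"
|> SeqT.filter (fun (url, _) -> url.Contains "en.wikipedia.org" |> not)
|> SeqT.map snd
|> SeqT.take 10
|> SeqT.iter (printfn "%s")

The only change from the async version is the extra `Async.StartAsTask` between the download and `SeqT.lift`; the downstream combinators are untouched because `SeqT` is generic over the underlying monad.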
