REBOL[
Title: "extract-web-links"
Version: 1.2.1
Date: 18-04-2005
Author: "Peter WA Wood"
Copyright: "Peter WA Wood"
File: %extract-web-links.r
Purpose: {A function which scans a string (normally a web page)
and creates a block of URL/Text combinations for each
HTML tag in the string.}
Usage: { example: extract-web-links read http://www.rebol.org }
Library: [
level: 'beginner
type: 'function
domain: [web html markup]
platform: 'all
tested-under: [ core 2.5.6.2.4 "Mac OS X 10.2.8"
core 2.5.6.3.1 "Windows XP Professional"
view 1.2.10.3.1 "Windows XP Professiona"]
support: none
license: cc-by
{see http://www.rebol.org/cgi-bin/cgiwrap/rebol/license-help.r}
]
]
extract-web-links: func [
{scans a string (normally a web page)
and returns a block of URL (url!) / Text (string!) combinations for
each HTML tag in the string}
web-page [string!]
"The string from which to extract web links"
/only-urls
"Only URLs are returned in the block"
/only-descriptions
"Only the supplied descriptions of the links are returned"
/local
result-block
"Block for harvesting URLs and descriptions"
collected-url
"Used to harvest individual URLs"
collected-desc
"Used to harvest individual descriptions"
end-pos
"end position of selection of web-page"
][
it: [ ; the main processing of the function
do initialisation
until [ ; end of web-page is reached
;Get the next URL in web-page
web-page: find/tail web-page " taqs too
if web-page [ ; tag found
if (not only-descriptions) [ ; URLs requested
do harvest-url ; Harvest the URL
either collected-url ; Add any URL to result block
[append result-block to-URL collected-url]
[break] ; skip to next url
]
if not only-urls [ ; Descriptions wanted ?
do harvest-desc
append result-block collected-desc
]
]
web-page = none ; test for end of web-page
] ; end until
return result-block
] ; end it
;;=======================================================================
initialisation: [
result-block: make block! [] ; initialise result block
; If both "only" refinements are set, turn them off.
; Using both refinements has the same effect as using neither.
; This allows the remainder of the code to treat the refinements as
; being mutually exclusive.
if all [only-urls only-descriptions] [
only-urls: none
only-descriptions: none
]
] ; end intialisation
;;=======================================================================
harvest-url: [ ; section to harvest URL
collected-url: copy ""
web-page: find/tail web-page "href=" ; move to char after href=
either web-page [ ; check href present
; Find start of URl
; href may be full or relative URL
; Skip opening quote if full URL
if (first web-page) = #"^""
[web-page: next web-page]
end-pos: find web-page ">" ; find end of tag
either end-pos [ ; end of tag found
collected-url: copy/part web-page end-pos
if (last collected-url) = #"^"" ; remove trailing quote
[collected-url: head remove back tail collected-url]
][ ; no closing > for tag !!
collected-url: none
]
][ ; no href !!!
collected-url: none
]
] ; end harvest-url
;;======================================================================
harvest-desc: [ ; section to harvest description
collected-desc: copy ""
web-page: find web-page ">" ; move to end of tag
if web-page [
web-page: next web-page ; move past >
if find web-page "" [ ; look for closing tag
end-pos: find web-page "" ; set end-pos at <
;check for img tag
either find/part web-page "")]][
collected-desc: find collected-desc "<"
remove remove/part collected-desc find collected-desc ">"
]
collected-desc: head collected-desc ; set index at start
] ; strip-embedded-tags
;;========================================================================
do it ; execute the function code
] ; end extract-web-links
;; History
;;
;; 1.1.0 23-Dec-2004 Initial release
;; 1.2.0 16-Jan-2004 Usage, copyright added to header
;; 1.2.1 19-Apr-2004 Tested under View 1.2.10.3.1
;;
;;