CODE
def web_crawler(seed):
    """Crawl pages reachable from *seed* and return a keyword index.

    Pages are taken from the end of the ``tocrawl`` list, so the most
    recently discovered links are visited first. Each page is fetched and
    indexed at most once.

    Relies on ``getpage``, ``get_all_links``, ``union`` and
    ``add_page_to_index`` being defined elsewhere.
    NOTE(review): ``getpage`` and ``union`` are not shown in this post;
    presumably ``getpage(url)`` returns the page text and
    ``union(a, b)`` adds to ``a`` the items of ``b`` it lacks -- confirm.

    Args:
        seed: URL of the page to start crawling from.

    Returns:
        The index built by ``add_page_to_index`` for every crawled page.
    """
    tocrawl = [seed]
    crawled = []
    # The original used ``index`` without ever creating it; start empty.
    index = []
    while tocrawl:
        page = tocrawl.pop()
        # Check against pages already visited. The original tested the
        # pending list ("to crawl"), from which ``page`` was just popped.
        if page not in crawled:
            content = getpage(page)
            add_page_to_index(index, page, content)
            union(tocrawl, get_all_links(content))
            crawled.append(page)
    return index
CODE
def add_to_index(index, keyword, url):
    """Record that *keyword* appears at *url*.

    Args:
        index: List of ``[keyword, [url, ...]]`` entries; modified in place.
        keyword: Word to file the URL under.
        url: Location where the word was found.

    Returns:
        None. A new entry is appended when *keyword* is not yet indexed.
    """
    for entry in index:
        if entry[0] == keyword:
            # A word repeated on one page must not list that page twice.
            if url not in entry[1]:
                entry[1].append(url)
            return
    # Reached only when no existing entry matched the keyword.
    index.append([keyword, [url]])
def lookup(index, keyword):
    """Return the URLs recorded for *keyword*.

    Args:
        index: List of ``[keyword, [url, ...]]`` entries.
        keyword: Word to search for.

    Returns:
        The URL list stored in the first matching entry (the stored list
        itself, not a copy), or a new empty list when nothing matches.
    """
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    # Only give up after every entry has been examined.
    return []
def add_page_to_index(index, url, content):
    """Index every whitespace-separated word of *content* under *url*.

    Args:
        index: Index passed through to ``add_to_index``.
        url: Address of the page that *content* came from.
        content: Page text; it is split on whitespace to obtain the words.

    Returns:
        None.
    """
    for word in content.split():
        add_to_index(index, word, url)
Just putting this up here for discussion. It's in Python.
This post has been edited by CastleFire: Mar 23 2017, 03:13 PM