#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :

# Grab.py - version 04
# - grab an URL
# - isolate words, count occurrences
# - store everything into a dict

import re
import sys
import urllib.request
from collections import Counter

# data
URLPREF = "http://"
# NOTE(review): name keeps the original (misspelled) spelling for backward
# compatibility.  Words of this length or shorter are discarded by getWords
# (the filter keeps len > MIN_WORD_LENGHT, i.e. 4+ character words survive).
MIN_WORD_LENGHT = 3


def getWords(data):
    """Return a sorted list of (word, count) tuples for the words found
    in the HTML string *data*.

    Words are lower-cased before counting; words of MIN_WORD_LENGHT
    characters or fewer, and words containing a digit, are discarded.
    The result is sorted alphabetically by word.
    """
    # Remove the HTML header section.
    # NOTE(review): the tag literals were stripped from the copy under
    # review (angle-bracket text lost in extraction); reconstructed as
    # <head>...</head> -- confirm against the original file.
    data = re.sub(r"<head>.*(.*\n)*.*</head>", "", data, flags=re.MULTILINE)
    # Remove the remaining HTML tags.
    data = re.sub(r"<[^<>]*>", "", data, flags=re.MULTILINE)
    # Remove HTML entities such as &amp; or &nbsp;.
    data = re.sub(r"&[^ ;]*;", "", data)
    # Replace punctuation and special characters by spaces so that
    # adjacent words split cleanly.
    data = re.sub(r"[,.;:!?\"'()\[\]{}%$€@#/+*_`-]", " ", data)
    # Isolate words; drop too-short words (same behaviour as the original
    # hard-coded "len(x) > 3": MIN_WORD_LENGHT is now actually used) and
    # any word containing a digit, then lower-case everything.
    words = [w.lower()
             for w in data.split()
             if len(w) > MIN_WORD_LENGHT and not re.search(r"[0-9]", w)]
    # Count occurrences; sorted() restores the alphabetical order the
    # original obtained with an in-place list.sort() before counting.
    # result = [(word1, n1), (word2, n2), ..., (wordN, nN)]
    return sorted(Counter(words).items())


# 1) args management
# NOTE(review): redundant re-assignment kept from the original (same value
# as the definition above).
URLPREF = "http://"


def usage():
    # NOTE(review): the original argument placeholder was stripped from the
    # copy under review (angle-bracket text lost) and the definition was
    # truncated mid-string; reconstructed as "<URL>" -- confirm against the
    # original file.
    print("usage: ", sys.argv[0], "<URL>")