This notebook will be available here :
https://wiki.student.info.ucl.ac.be/Documentation/Python
... and on Moodle
In order to use this notebook as slides with the possibility to execute Python code, we need
Of course, every slide MUST be labelled with the Slides, Subslides, and so on attributes.
To execute the notebook : $ ipython notebook grab_a_python_tutorial.ipynb
Dynamically interpreted
python code (.py) -> bytecode (.pyc) -> execution
Python language
Some tools
#!/usr/bin/env python3
# coding: utf-8
# Minimal script: each print() call writes its argument followed by a newline.
print("Hello world!")
print("Hello AI students!")
print("test testing test")
allow execution:
chmod +x myfile.py
execute:
./myfile.py
python3 myfile.py
The Grab project : build a web indexer
#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
#
# Grab.py - version 01 - check args

# 1) args management
import sys

# Only command-line arguments starting with this prefix are accepted.
URLPREF = "http://"


def usage():
    """Print a one-line usage message and exit with status 1."""
    print("usage:", sys.argv[0], "<url to parse>")
    sys.exit(1)


# Exactly one argument is expected and it must start with URLPREF.
if not (len(sys.argv) == 2 and sys.argv[1].startswith(URLPREF)):
    usage()

url = sys.argv[1]
print("You ask to grab", url)
# defining variables <- this is a comment
a = 1
b = 2
c = 'string'
print("a+b=", a + b, "\na-b=", a - b)
# "/" is true division (1 / 2 gives 0.5) and "**" is exponentiation
print("a*b=", a * b, "\na/b=", a / b, "\nb^b=", b ** b)
d = 0.1
e = 3.4 + 2j  # the "j" suffix builds a complex number
print("a is a", type(a), "\nb is a", type(b), "\nc is a", type(c))
print("d is a", type(d), "\ne is a", type(e))
# variables have no fixed type: the same name can be rebound to any object
b = 1
print(b)
b = "another string"
print(b)
# strings are sequences: characters are accessed by index
print(b)
print(b[0])
print(b[1])
print("Positive indexes", b[0], b[1], b[2])
# negative indexes count from the end of the string
print("Negative indexes", b[-1], b[-2], b[-3])
print(len(b))
# take a slice of string
print(b)
print(b[0:2]) # first 2 chars
print(b[0:len(b)-1]) # all but the last
print(b[0:-1]) # all but the last (2nd version)
print(b[:-1]) # all but the last (3rd version)
print(b[1:]) # all but the first
print(b[:]) # a copy of b
# strings are *immutable*: the assignment below raises TypeError (intentional)
b[1] = "c"
# playing with string : + and *
b = "Spam"
print("b+b=", b + b, "\n" + "b*3=", b*3)
print("Egg, " + (b + ", ") * 4 + b + " and Egg")
# dir() lists the attributes and methods of an object, help() shows documentation
print(dir(b))
help(b.startswith)
# calling an instance method
b = "Spam"
print(b, b.startswith("Sp"))
b = "Egg"
print(b, b.startswith("Sp"))
spacious = " " + b + " \n\n "
# string methods return a new string; the original is left untouched
spacious.strip()
b = "Spam"
b.split('a')
b.upper()
b.lower()
# import modules elements into your program
import sys
# help() and dir() work on modules too
help(sys)
dir(sys)
# sys.argv holds the command-line arguments
type(sys.argv)
# a list may mix types and may contain other lists
l = [1, 2, "a", 3]
ll = [4.5, l, 6.7]
print(l)
print(ll)
print(l[0])
print(l[-1])
print(ll[1])
print("l \t= ", l)
print("l[0:2] \t= ", l[0:2]) # first 2 elements
print("l[:-1] \t= ", l[:-1]) # all but the last
print("l[1:] \t= ", l[1:]) # all but the first
print("l[:] \t= ", l[:]) # a copy of l
print("l = ", l)
print("ll = ", ll)
# unlike strings, lists are mutable: items can be reassigned
l[0] = 42
l[0] += 1
print(l)
print("l = ", l)
# ll[1] is the list l itself (not a copy), so the change shows up here too
print("ll = ", ll)
ll[1][0] = 0.0 # double index: modifies l[0] through ll
print(ll)
print(l)
print("l = ", l)
# even slice assignment: here 2 elements are replaced by 3
l[0:2] = ["A", "B", "C"]
print(l)
# append at the end
l.append("new element")
l
# pop at index (default -1)
l.pop(1)
l
# insert at index
l.insert(1, "another element")
print(l)
dir(l)
b = "Holy"
if b == "Spam":         # line ending with : -> requires an indented block
    pass                # do nothing, in block statement
    print("OK")         # next in block statement
elif b == "Egg":
    print("KO")
elif b == "Holy":
    print("The", b, "Grail")
else:
    print("wtf?")
print("Out of the IF")  # out of block statement

# boolean operators : and, or, not
b = "Spam"
if b.startswith("Sp") and b.endswith("am"):
    print("Spam" + b)
def myfunction(arg1, arg2, arg3):  # line ending with : -> requires an indented block
    """ this is a docstring, meant
    to give information on my function

    Print a banner line followed by the three arguments.
    There is no return statement, so the call evaluates to None.
    """
    print("This is myfunction")
    print("Args:", arg1, arg2, arg3)
# call myfunction
ret = myfunction(1, 2, 3)
print(ret)  # myfunction has no return statement, so this prints None
help(myfunction)  # shows the signature and the docstring
def my2ndfunction(arg1, arg2, arg3):  # : => indented block
    """ this is a docstring

    Print a banner and return the three arguments formatted as one string.
    """
    # inside parentheses a statement may span several lines
    print("This is my2ndfunction" +
          "test")
    # outside parentheses a trailing backslash continues the line
    return "Args: " + str(arg1) + ", " \
        + str(arg2) + ", " + str(arg3)


ret = my2ndfunction(1, 2, 3)
print(type(ret))
print("ret = ", ret)
#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
#
# Grab.py - version 02 - grab an URL

import sys
import urllib.request

# 1) args management
URLPREF = "http://"


def usage():
    """Print a one-line usage message and exit with status 1."""
    print("usage: ", sys.argv[0], "<url to parse>")
    sys.exit(1)


if not (len(sys.argv) == 2 and sys.argv[1].startswith(URLPREF)):
    usage()

url = sys.argv[1]
print("You ask to grab", url)

# 2) retrieve the URL
# urlopen() returns a file-like object that yields bytes; the "with" block
# makes sure the connection is closed once the page has been printed.
with urllib.request.urlopen(url) as data:
    print("Here it is:")
    for line in data:
        print(line.decode("utf-8"), end="")
string = "abcdef"
for letter in string:  # line ending with : -> indented block
    print(letter, end=" ")

alist = ["Lancelot", "Galaad", "Bedivere", "Arthur", "Robin"]
for knight in alist:  # line ending with : -> indented block
    title = "Sir"
    if knight == "Arthur":  # line ending with : -> indented block
        title = "King"
    print(title, knight)

# enumerate() pairs every element with its index
list(enumerate(alist))
for i, knight in enumerate(alist):
    if i == len(alist) - 1:
        print("And the last: ", end="")
    print(i, knight)

list(range(5))
for i in range(5):
    print(i)

a = ['Mary', 'had', 'a', 'little', 'lamb']
for i in range(len(a)):  # Don't do this, this is NOT pythonic! Use enumerate if you want this output
    print(i, a[i])

range(5)  # range returns a lazy range object, not a list: wrap it in list() to see the values
list(range(3, 10))
list(range(2, 10, 2))
list(range(-10, -100, -30))
import os.path

filename = "myfile.txt"
f = open(filename, "w")  # mode = "r", "w", "r+"
                         # and that can be combined
                         # with "b" for binary files
lines = ["First line", "Second line", "Last line"]
for line in lines:
    f.write(line + "\n")  # write() does not add the newline itself
f.close()

os.path.isfile(filename)

# a file object is iterable: one line (newline included) per iteration
ff = open(filename, "r")
for line in ff:
    print(line, end='')
ff.close()

# read() returns the whole content as a single string
ff = open(filename, "r")
print(ff.read())
ff.close()

# readline() returns the next line, readlines() the list of remaining lines
ff = open(filename, "r")
ff.readline()
ff.readlines()
ff.close()
import urllib.request

f = urllib.request.urlopen("http://localhost:8000")  # grab the content, return file object
f.readline()
f.readline()
# readline() returns a bytes object here, not a str
f.readline().decode("utf-8")  # to produce usual string
# iterating continues from the current position: the 3 lines read above are not repeated
for line in f:
    print(line.decode("utf-8"), end="")
f.close()
#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
#
# Grab.py - version 03 - grab an URL, isolates words

import re
import sys
import urllib.request

# 1) args management
URLPREF = "http://"


def usage():
    """Print a one-line usage message and exit with status 1."""
    print("usage: ", sys.argv[0], "<url to parse>")
    sys.exit(1)


if not (len(sys.argv) == 2 and sys.argv[1].startswith(URLPREF)):
    usage()

url = sys.argv[1]
print("You ask to grab", url)

# 2) retrieve the URL
# 3) Parse it
# Each step below prints its result and waits for <Enter> before going on.
with urllib.request.urlopen(url) as raw_data:
    data = raw_data.read().decode("utf-8")
print(data)
print("Raw data\n" + "="*80)
input()

# 3.1) remove HTML header
# DOTALL lets "." cross line breaks; the non-greedy ".*?" stops at the first
# closing tag. Upper-case tags and <head ...> with attributes are handled too.
data = re.sub(r"<head\b[^>]*>.*?</head>", "", data, flags=re.DOTALL | re.IGNORECASE)
print(data)
print("No HTML Header\n" + "="*80)
input()

# 3.2) remove HTML tags
data = re.sub(r"<[^<>]*>", "", data)
print(data)
print("No HTML Tag\n" + "="*80)
input()

# 3.3) remove HTML entities like &amp;
data = re.sub(r"&[^ ;]*;", "", data)
print(data)
print("No HTML entity\n" + "="*80)
input()

# 3.4) remove unwanted chars
data = re.sub(r"[,.;:!?\"'()\[\]{}%$€@#/+*_`-]", " ", data)
print(data)
print("No unwanted chars\n" + "="*80)
input()

# 3.5) isolate words
words = data.split()
print(words)
print("\nThe words\n" + "="*80)
input()

# 3.6) remove too short words and any word containing number
words = filter(lambda x: len(x) > 3, words)
words = list(filter(lambda x: not re.search("[0-9]", x), words))
print(words)
print("\nThe words with no digit and more than 3 chars\n" + "="*80)
input()

# 3.7) lower case
words = list(map(lambda x: x.lower(), words))
print(words)
print("\nAll in lowercase\n" + "="*80)
input()

# 3.8) sort it and remove duplicate instances
words = list(set(words))
words.sort()  # sort *in place*, returns None !
print(words)
print("\nSorted words, with no duplicate\n" + "="*80)
import re
# findall() returns every non-overlapping match: here the words starting with "f"
re.findall(r'\bf[a-z]*', 'which foot or hand fell fastest')
# \1 refers to the first group: a doubled word is replaced by a single one
re.sub(r'(\b[a-z]+) \1', r'\1', 'cat in the the hat')
# the separators are kept in the result because the pattern is a capturing group
re.split("(b|e)", "abcdef")
re.match("c", "abcdef") # No match: match() only looks at the beginning of the string
re.search("c", "abcdef") # search() scans the whole string
f = lambda x: x+42  # lambda forms are unnamed functions
f
f(5)


def make_incrementor(inc):  # a function that returns functions!
    """Return a one-argument function that adds inc to its argument."""
    return lambda x: x + inc


f = make_incrementor(42)
f
f(5)
ff = make_incrementor(5)
ff(5)
ff = f  # ff is now an alias of f
ff(5)
filter(function, sequence) applies a filter to a sequence, i.e. it yields the elements of sequence for which function returns True. In Python 3 the result is a lazy iterator: wrap it in list() to get a list.
def is_even(x):
    """Return True when x is divisible by 2."""
    return x % 2 == 0


print(list(range(10)))
print(list(filter(is_even, range(10))))
# the same idea with a lambda, keeping the odd numbers this time
list(filter(lambda x: x%2 != 0, range(10)))
map(function, sequence) applies function(item) to each item of the sequence and yields the return values. In Python 3 the result is a lazy iterator: wrap it in list() to get a list.
def cube(x):
    """Return x multiplied by itself twice (x to the third power)."""
    squared = x * x
    return squared * x


# the named function and the equivalent lambda give the same list
list(map(cube, range(1, 11)))
list(map(lambda x: x**3, range(1, 11)))
seq = range(8)


def add(x, y):
    """Return the result of x + y."""
    total = x + y
    return total


# with several sequences, map() passes one item of each to the function
list(map(add, seq, seq))
list(map(lambda x, y: x + y, seq, seq))
basket = ["apple", "orange", "apple", "pear", "orange", "banana"]
fruit = set(basket) # create a set without duplicates
fruit
# membership test with "in"
"orange" in fruit
"salad" in fruit
# a set built from a string contains its distinct characters
a = set("abracadabra")
b = set("alacazam")
# set operators: difference (-), union (|), intersection (&), symmetric difference (^)
print("a \tunique letters in abracadabra: \t", a)
print("a - b \tletters in a but not in b: \t", a - b)
print("a | b \tletters in either a or b: \t", a | b)
print("a & b \tletters in a and b: \t\t", a & b)
print("a ^ b \tletters in a or b but not both: ", a ^ b)
#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
# Grab.py - version 04
# - grab an URL
# - isolate words, count occurencies
# - store everything into a dict
import sys
import urllib.request
import re
# data
URLPREF = "http://"
MIN_WORD_LENGHT = 3
def getWords(data):
    """Return the words of an HTML string with their number of occurrences.

    The header, the tags, the entities and the punctuation are stripped,
    then only the words longer than 3 characters and containing no digit
    are kept, in lower case.

    data    -- the HTML document, as a string
    returns -- a list of (word, count) tuples, sorted by word
    """
    # remove HTML header: DOTALL lets "." cross line breaks and the non-greedy
    # ".*?" stops at the first closing tag; upper-case tags and attributes
    # on <head> are accepted too
    data = re.sub(r"<head\b[^>]*>.*?</head>", "", data, flags=re.DOTALL | re.IGNORECASE)
    # remove HTML tags
    data = re.sub(r"<[^<>]*>", "", data)
    # remove HTML entities
    data = re.sub(r"&[^ ;]*;", "", data)
    # remove special chars
    data = re.sub(r"[,.;:!?\"'()\[\]{}%$€@#/+*_`-]", " ", data)
    # isolate words
    words = data.split()
    # remove too short words and any word containing number
    words = filter(lambda x: len(x) > 3, words)
    words = list(filter(lambda x: not re.search("[0-9]", x), words))
    # lower case
    words = list(map(lambda x: x.lower(), words))
    # sort it, so that identical words end up next to each other
    words.sort()  # sort *in place*, returns None !
    # count occurrences
    # result = [(word1, n1), (word2, n2), ..., (wordN, nN)]
    result = []  # note. [] is also False
    for word in words:
        if result and word == result[-1][0]:  # first elt. of the last tuple of result
            result[-1] = (word, result[-1][1] + 1)
            continue
        result.append((word, 1))
    return result
# 1) args management
URLPREF = "http://"


def usage():
    """Print a one-line usage message and exit with status 1."""
    print("usage: ", sys.argv[0], "<url to parse>")
    sys.exit(1)


if not (len(sys.argv) == 2 and sys.argv[1].startswith(URLPREF)):
    usage()

url = sys.argv[1]
print("You ask to grab", url)

# 2) retrieve the URL ("with" closes the connection once the page is read)
with urllib.request.urlopen(url) as raw_data:
    data = raw_data.read().decode("utf-8")

# 3) isolate words and store them
# The idea is to produce a structure that can contain the result of
# many URL analysis: use a dictionary
# Dict structure:
#  key      ->    value
#  ---            -----
#
#  words    ->    [(nmbr1, url1), (nmbr2, url2), ...]
#
storage = {}
for word, n in getWords(data):
    #### Most obvious version ###
    if word not in storage:
        storage[word] = [(n, url)]
    else:
        storage[word].append((n, url))
    #### it is easier to ask for forgiveness than for permission ###
    #
    # try:
    #     storage[word].append((n, url))
    # except KeyError:
    #     storage[word] = [(n, url)]
    #
print(storage)
# Empty containers, empty strings, zeros and None all count as False
if [] or "" or {} or () or 0 or 0.0 or 0.0 + 0j or None:
    print("Some are True")
else:
    print("All are False")
# any non-zero number counts as True
if 1:
    print("1 is True")
# a tuple is written as comma-separated values, usually inside parentheses
t = (12345, 54321, 'hello!')
t[0]
t
# Tuples may be nested:
u = (t, (1, 2, 3, 4, 5))
u
u[0] = 4  # tuples are immutable: this raises TypeError (intentional)
# Beware of mutable inside immutable!
v = (1, [2, 3], 4)
v
v[1] = range(5)  # TypeError again: a tuple item cannot be reassigned (intentional)
v[1][0] = "mutables can be modified!"  # but the list held by the tuple can be changed in place
v
len(v)
# a tuple can be iterated over, like a list or a string
for elt in v:
    print(elt)
empty = ()
singleton = "hello", # <-- note the trailing comma: it is the comma that makes the tuple
len(empty)
len(singleton)
singleton
('hello') # <-- without trailing comma: just a string between parentheses, not a tuple
# a dict maps keys to values
tel = {'jack': 4098, 'sape': 4139}
tel['guido'] = 4127  # assigning to a new key adds an entry
tel
tel['jack']
del tel['sape']  # del removes an entry
tel['irv'] = 4127
tel
list(tel.keys())
'guido' in tel  # "in" tests the keys
Building a dict from a list
# dict() accepts a list of (key, value) pairs
dict([('sape', 4139), ('guido', 4127), ('jack', 4098)])
# "dict" is a builtin name, no module prefix is needed (the Python 2
# "__builtin__" module does not exist in Python 3 and raised NameError here)
help(dict)
knights = {'gallahad': 'the pure', 'robin': 'the brave'}
# items() yields (key, value) pairs, in insertion order (guaranteed since Python 3.7)
for k, v in knights.items():
    print("Sir", k[0].upper() + k[1:], v)
#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
# Grab.py - version 05
# - grab an URL
# - isolate words, count occurencies
# - store everything into a dict
# - crawl from the URL and repeat the treatment on linked pages
import sys
import urllib.request
import re
# data
URLPREF = "http://"
MIN_WORD_LENGHT = 3
storage = {}
# tools
def getWords(data):
    """Return the words of an HTML string with their number of occurrences.

    The header, the tags, the entities and the punctuation are stripped,
    then only the words longer than 3 characters and containing no digit
    are kept, in lower case.

    data    -- the HTML document, as a string
    returns -- a list of (word, count) tuples, sorted by word
    """
    # DOTALL lets "." cross line breaks and the non-greedy ".*?" stops at the
    # first closing tag; upper-case tags and attributes on <head> are accepted
    data = re.sub(r"<head\b[^>]*>.*?</head>", "", data, flags=re.DOTALL | re.IGNORECASE)
    data = re.sub(r"<[^<>]*>", "", data)
    data = re.sub(r"&[^ ;]*;", "", data)
    data = re.sub(r"[,.;:!?\"'()\[\]{}%$€@#/+*_`-]", " ", data)
    words = data.split()
    ### List comprehension to do 3 steps in a single one ###
    words = [w.lower() for w in words if len(w) > 3 and not re.search("[0-9]", w)]
    #
    # This has the same effect as
    ## words = filter(lambda x: len(x) > 3, words)
    ## words = list(filter(lambda x: not re.search("[0-9]",x), words))
    ## words = list(map(lambda x: x.lower(), words))
    words.sort()  # sort *in place*, returns None !
    result = []
    for word in words:
        if result and word == result[-1][0]:  # first elt. of the last tuple of result
            result[-1] = (word, result[-1][1] + 1)
            continue
        result.append((word, 1))
    return result
def storeWords(words, url):
    """
    Store words into our storage dict, but using dict instead of list
    {word : {URL1: #1, URL2: #2, ...}}

    words -- an iterable of (word, count) tuples
    url   -- the key under which each count is recorded
    """
    for word, n in words:
        # the most efficient way to initialize/modify a dict entry
        storage.setdefault(word, {})[url] = n
        #       |                          |
        #       +--------------------------+
        #                    |
        #                    v
        #          return storage[word]
        #          if it does not exist, add it
        #          with {} as value before returning
# ============ Do something useful from here ============
import urllib.parse

# 1) args management
URLPREF = "http://"


def usage():
    """Print a one-line usage message and exit with status 1."""
    print("usage: ", sys.argv[0], "<url to parse>")
    sys.exit(1)


if not (len(sys.argv) == 2 and sys.argv[1].startswith(URLPREF)):
    usage()

url = sys.argv[1]
print("You ask to grab", url)

# 2) retrieve the URL ("with" closes the connection once the page is read)
with urllib.request.urlopen(url) as raw_data:
    data = raw_data.read().decode("utf-8")

# 3) isolate words and store them
storeWords(getWords(data), url)

# 4) isolate external links (very simple implementation)
links = re.findall(r"(?i)<a[^>]*href=\"([^\"#]*)\"", data)
print(links)
input()

# 5) repeat the treatment on these URLs
for link in links:
    # urljoin() resolves relative links against the page URL and returns
    # absolute ones (http or https) unchanged; it also copes with a start URL
    # that has no path and with links beginning with "/"
    link = urllib.parse.urljoin(url, link)
    print(" -> grabbing", link)
    try:
        with urllib.request.urlopen(link) as response:
            raw_data = response.read().decode("utf-8")
        storeWords(getWords(raw_data), link)
    except Exception as err:
        # best-effort crawl: report the page that failed and go on with the
        # next one; unlike a bare "except", Ctrl-C still stops the script
        print("    skipped:", err, file=sys.stderr)

words = sorted(storage)
for word in words:
    # get a list of urls mentioning word, sorted by descending number of occurrences
    urls = sorted(storage[word].items(), key=lambda x: x[1], reverse=True)  # cfr. itemgetter from operator module
    print(word, urls)
freshfruit = [' banana', ' loganberry \n', 'passion fruit ']
# build a new list by calling strip() on every element
[weapon.strip() for weapon in freshfruit]
Syntax:
[ value with a variable for variable in an iterable ]
vec = [2, 4, 6]
[3*x for x in vec]
[3*x for x in vec if x > 3] # add conditions
[3*x for x in vec if x < 2] # no element passes the test: the result is an empty list
[[x,x**2] for x in vec] # build (potentially) complex objects
With more than one variable too!
vec1 = [2, 4, 6]
vec2 = [4, 3, -9]
[x*y for x in vec1 for y in vec2] # combine all possible pairs (3 x 3 = 9 results)
[vec1[i]*vec2[i] for i in range(len(vec1))] # explicit association by index
[v1*v2 for v1, v2 in zip(vec1, vec2)] # zip does that
sorted([5, 2, 3, 1, 4]) # sorted returns a new sorted list
print( [5, 2, 3, 1, 4].sort() ) # the sort method sorts in place and returns None
l = [5, 2, 3, 1, 4]
l.sort()
l
sorted("copyright") # works with ANY iterable, the result is always a list
The key parameter
# default string ordering is case-sensitive: capitalized words come first
sorted("This is a test string from Andrew".split())
# key is called on each element and the elements are ordered by its result
sorted("This is a test string from Andrew".split(), key=str.lower)
student_tuples = [ ('john', 'A', 15), # (name, grade, age)
('jane', 'B', 12),
('dave', 'B', 10) ]
# sort by age
sorted(student_tuples, key=lambda student: student[2])
The reverse parameter
# reverse=True gives descending order: the oldest student comes first
sorted(student_tuples, key=lambda student: student[2], reverse=True)
With all this in mind, you can understand the Grab example.