Commit c715603b authored by Franck Thollard's avatar Franck Thollard
Browse files

adding different ways of solving the smat completion DIY

parent 2f31e2af
......@@ -146,7 +146,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 1,
"metadata": {},
"outputs": [
{
......@@ -162,7 +162,7 @@
"# \"pasting\" two lists can be done using zip\n",
"l1 = [1, 2, 3]\n",
"s = 'abc'\n",
"print(list(zip(l1, l2)))\n",
"print(list(zip(l1, s)))\n",
"print(list(zip('abc', 'defg')))"
]
},
......@@ -563,15 +563,15 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"integer remove from the list: 5\n",
"shuffled list: [4, 19, 14, 1, 8, 17, 15, 2, 3, 12, 0, 6, 16, 9, 11, 10, 7, 13, 18]\n"
"integer remove from the list: 3\n",
"shuffled list: [4, 2, 5, 16, 15, 6, 9, 18, 8, 7, 13, 11, 17, 14, 12, 0, 19, 1, 10]\n"
]
}
],
......@@ -610,16 +610,16 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{5}"
"3"
]
},
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
......@@ -627,7 +627,8 @@
"source": [
"full_set = set(range(n))\n",
"changed_set = set(l)\n",
"full_set - changed_set"
"ns = full_set - changed_set\n",
"ns.pop()"
]
},
{
......@@ -642,7 +643,8 @@
" \n",
" -> Complixity of the whole algorithm : O(n)\n",
" \n",
"# Note "
"# Complexity of the \"sum\" solution : \n",
" - One traversal for the computation of the sum O(n) with sum at each step O(1) -> O(n) "
]
},
{
......@@ -1010,6 +1012,96 @@
"- Given a query pattern of size 2, propose the pattern of size 3 with the same prefix that has the highest frequency. Filter the keys of the previous dictionary so that they starts with the query pattern."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"len(s) = 7160000, nbkeys 33 base, count, count_count, except, colection.counter\n",
"1 loop, best of 5: 550 ms per loop\n",
"1 loop, best of 5: 495 ms per loop\n",
"10 loops, best of 5: 146 ms per loop\n",
"1 loop, best of 5: 488 ms per loop\n",
"1 loop, best of 5: 265 ms per loop\n",
"1 loop, best of 5: 450 ms per loop\n",
"with split\n",
"len(s) = 1100000, nbkeys 90 base, count, count_count, except, colection.counter\n",
"10 loops, best of 5: 123 ms per loop\n",
"10 loops, best of 5: 113 ms per loop\n",
"1 loop, best of 5: 992 ms per loop\n",
"10 loops, best of 5: 101 ms per loop\n",
"10 loops, best of 5: 62.3 ms per loop\n",
"1 loop, best of 5: 449 ms per loop\n"
]
}
],
"source": [
"def build_count_base(t): \n",
" d = {} \n",
" for s in t:\n",
" if s in d:\n",
" d[s] += 1\n",
" else: \n",
" d[s] = 1\n",
" return d\n",
"\n",
"def build_count_set(t): \n",
" d = {k:0 for k in set(t)}\n",
" for s in t:\n",
" d[s] += 1\n",
" return d\n",
"\n",
"def build_count_count(t):\n",
" d = {k:t.count(k) for k in set(t)}\n",
" return d\n",
"\n",
"def build_count_excpt(t): \n",
" d = {} \n",
" for s in t:\n",
" try: \n",
" d[s] += 1\n",
" except:\n",
" d[s] = 1\n",
" return d\n",
"\n",
"import collections\n",
"\n",
"def build_count_counter(t):\n",
" return collections.Counter(t)\n",
"\n",
"def build_count_defaultdict(t):\n",
" d = collections.defaultdict(int)\n",
" for k in s:\n",
" d[k] += 1\n",
" return d\n",
"\n",
"s = \"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam tristique at velit in varius. Cras ut ultricies orci. Fusce vel consequat ante, vitae luctus tortor. Sed condimentum faucibus enim, sit amet pulvinar ligula feugiat ac. Sed interdum id risus id rhoncus. Nullam nisi justo, ultrices eu est nec, hendrerit maximus lorem. Nam urna eros, accumsan nec magna eu, elementum semper diam. Nulla tempus, nibh id elementum dapibus, ex diam lacinia est, sit amet suscipit nulla nibh eu sapien. Aliquam orci enim, malesuada in facilisis vitae, pharetra sit amet mi. Pellentesque mi tortor, sagittis quis odio quis, fermentum faucibus ex. Aenean sagittis nisl orci. Maecenas tristique velit sed leo facilisis porttitor. \"\n",
"s = s*10000\n",
"len(s)\n",
"print(f\"len(s) = {len(s)}, nbkeys {len(set(s))} base, count, count_count, except, colection.counter\")\n",
"%timeit build_count_base(s)\n",
"%timeit build_count_set(s)\n",
"%timeit build_count_count(s)\n",
"%timeit build_count_excpt(s)\n",
"%timeit build_count_counter(s)\n",
"%timeit build_count_defaultdict(s)\n",
"\n",
"print(\"with split\")\n",
"s2 = s.split()\n",
"print(f\"len(s) = {len(s2)}, nbkeys {len(set(s2))} base, count, count_count, except, colection.counter\")\n",
"%timeit build_count_base(s2)\n",
"%timeit build_count_set(s2)\n",
"%timeit build_count_count(s2)\n",
"%timeit build_count_excpt(s2)\n",
"%timeit build_count_counter(s2)\n",
"%timeit build_count_defaultdict(s2)\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
......@@ -1158,18 +1250,6 @@
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2+"
}
},
"nbformat": 4,
......
%% Cell type:markdown id: tags:
# Python training UGA 2017
**A training to acquire strong basis in Python to use it efficiently**
Pierre Augier (LEGI), Cyrille Bonamy (LEGI), Eric Maldonado (Irstea), Franck Thollard (ISTerre), Christophe Picard (LJK), Loïc Huder (ISTerre)
# [Data structures](https://docs.python.org/3.6/tutorial/datastructures.html)
%% Cell type:markdown id: tags:
### list: mutable sequence
Lists are mutable ordered tables of inhomogeneous objects. They can be viewed as an array of references (nearly pointers) to objects.
%% Cell type:code id: tags:
``` python
# 2 equivalent ways to define an empty list
l0 = []
l1 = list()
assert l0 == l1
# not empty lists
l2 = ['a', 2]
l3 = list(range(3))
print(l2, l3, l2 + l3)
print(3*l2)
```
%%%% Output: stream
['a', 2] [0, 1, 2] ['a', 2, 0, 1, 2]
['a', 2, 'a', 2, 'a', 2]
%% Cell type:markdown id: tags:
The `itertools` module provides other ways of iterating over lists or sets of lists (e.g. cartesian product, permutation, filter, ... ): https://docs.python.org/3/library/itertools.html
%% Cell type:markdown id: tags:
### list: mutable sequence
The builtin function `dir` returns a list of the names of the attributes. For a list, these attributes are python system attributes (with double-underscores) and 11 public methods:
%% Cell type:code id: tags:
``` python
print(dir(l3))
```
%%%% Output: stream
['__add__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__rmul__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort']
%% Cell type:code id: tags:
``` python
l3.append(10)
print(l3)
l3.reverse()
print(l3)
```
%%%% Output: stream
[0, 1, 2, 10]
[10, 2, 1, 0]
%% Cell type:code id: tags:
``` python
# Built-in functions applied on lists
# return lower value
print(min(l3))
# return higher value
print(max(l3))
# return sorted list
print(sorted([5, 2, 10, 0]))
```
%%%% Output: stream
0
10
[0, 2, 5, 10]
%% Cell type:code id: tags:
``` python
# "pasting" two lists can be done using zip
l1 = [1, 2, 3]
s = 'abc'
# zip accepts any iterables: here a list of integers and a string
print(list(zip(l1, s)))
# zip stops as soon as the shortest input is exhausted, so 'g' is dropped
print(list(zip('abc', 'defg')))
```
%%%% Output: stream
[(1, 'a'), (2, 'b'), (3, 'c')]
[('a', 'd'), ('b', 'e'), ('c', 'f')]
%% Cell type:markdown id: tags:
### `list`: list comprehension
They are iterable so they are often used to make loops. We have already seen how to use the keyword `for`. For example to build a new list (side note: `x**2` computes `x^2`):
%% Cell type:code id: tags:
``` python
l0 = [1, 4, 10]
l1 = []
for number in l0:
l1.append(number**2)
print(l1)
```
%%%% Output: stream
[1, 16, 100]
%% Cell type:markdown id: tags:
There is a more readable (and slightly more efficient) method to do such things, the "list comprehension":
%% Cell type:code id: tags:
``` python
l1 = [number**2 for number in l0]
print(l1)
```
%%%% Output: stream
[1, 16, 100]
%% Cell type:code id: tags:
``` python
# list comprehension with a condition
[s for s in ['a', 'bbb', 'e'] if len(s) == 1]
```
%%%% Output: execute_result
['a', 'e']
%% Cell type:code id: tags:
``` python
# lists comprehensions can be cascaded
[(x,y) for x in [1,2] for y in ['a','b'] ]
```
%%%% Output: execute_result
[(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b')]
%% Cell type:markdown id: tags:
### Do it yourself (advanced)
- Write a function `extract_patterns(text, n=3)` extracting the list of patterns of size `n=3` from a long string (e.g. if `text = "basically"`, patterns would be the list `['bas', 'asi', 'sic', ..., 'lly']`). Use list comprehension, range, slicing. Use a sliding window.
- You can apply your function to a long "lorem ipsum" string (ask your favorite web search engine).
%% Cell type:markdown id: tags:
#### A possible solution
%% Cell type:code id: tags:
``` python
text = "basically"
def extract_patterns(text, n=3):
    """Return every substring of length ``n`` of ``text``, in order of appearance.

    A window of width ``n`` slides over ``text`` one character at a time.
    The result is an empty list when ``text`` is shorter than ``n``.
    """
    windows = []
    last_start = len(text) - n
    for start in range(last_start + 1):
        windows.append(text[start:start + n])
    return windows
print("patterns=", extract_patterns(text))
print("patterns=", extract_patterns(text, n=5))
```
%%%% Output: stream
patterns= ['bas', 'asi', 'sic', 'ica', 'cal', 'all', 'lly']
patterns= ['basic', 'asica', 'sical', 'icall', 'cally']
%% Cell type:markdown id: tags:
### `tuple`: immutable sequence
Tuples are very similar to lists but they are immutable (they can not be modified).
%% Cell type:code id: tags:
``` python
# 2 equivalent notations to define an empty tuple (not very useful...)
t0 = ()
t1 = tuple()
assert t0 == t1
# not empty tuple
t2 = (1, 2, 'a') # with the parenthesis
t2 = 1, 2, 'a' # it also works without parenthesis
t3 = tuple(l3) # from a list
```
%% Cell type:code id: tags:
``` python
# tuples only have 2 public methods (with a list comprehension)
[name for name in dir(t3) if not name.startswith('__')]
```
%%%% Output: execute_result
['count', 'index']
%% Cell type:code id: tags:
``` python
# assigment of multiple variables in 1 line
a, b = 1, 2
print(a, b)
# exchange of values
b, a = a, b
print(a, b)
```
%%%% Output: stream
1 2
2 1
%% Cell type:markdown id: tags:
### `tuple`: immutable sequence
Tuples are used *a lot* with the keyword `return` in functions:
%% Cell type:code id: tags:
``` python
def myfunc():
    """Return the three integers 1, 2 and 3 packed in a single tuple."""
    packed = (1, 2, 3)
    return packed
t = myfunc()
print(type(t), t)
# Directly unpacking the tuple
a, b, c = myfunc()
print(a, b, c)
```
%%%% Output: stream
<class 'tuple'> (1, 2, 3)
1 2 3
%% Cell type:markdown id: tags:
### `set`: a hashtable
Unordered collections of unique elements (a hashtable). Sets are mutable. The elements of a set must be [hashable](https://docs.python.org/3/glossary.html#term-hashable).
%% Cell type:code id: tags:
``` python
s0 = set()
```
%% Cell type:code id: tags:
``` python
{1, 1, 1, 3}
```
%%%% Output: execute_result
{1, 3}
%% Cell type:code id: tags:
``` python
set([1, 1, 1, 3])
```
%%%% Output: execute_result
{1, 3}
%% Cell type:code id: tags:
``` python
s1 = {1, 2}
s2 = {2, 3}
print(s1.intersection(s2))
print(s1.union(s2))
```
%%%% Output: stream
{2}
{1, 2, 3}
%% Cell type:markdown id: tags:
### `set`: lookup
Hashtable lookup (for example `1 in s1`) is algorithmically efficient (complexity O(1)), i.e. theoretically faster than a look up in a list or a tuple (complexity O(size iterable)).
%% Cell type:code id: tags:
``` python
print(1 in s1, 1 in s2)
```
%%%% Output: stream
True False
%% Cell type:markdown id: tags:
### What is a hashtable?
https://en.wikipedia.org/wiki/Hash_table
%% Cell type:code id: tags:
``` python
from random import shuffle, randint
n = 20
i = randint(0, n-1)
print('integer remove from the list:', i)
l = list(range(n))
l.remove(i)
shuffle(l)
print('shuffled list: ', l)
```
%%%% Output: stream
integer remove from the list: 5
shuffled list: [4, 19, 14, 1, 8, 17, 15, 2, 3, 12, 0, 6, 16, 9, 11, 10, 7, 13, 18]
integer remove from the list: 3
shuffled list: [4, 2, 5, 16, 15, 6, 9, 18, 8, 7, 13, 11, 17, 14, 12, 0, 19, 1, 10]
%% Cell type:markdown id: tags:
## DIY: back to the "find the removed element" problem
- Could the problem be solved using set ?
- What is the complexity of this solution ?
%% Cell type:markdown id: tags:
## A possible solution :
%% Cell type:code id: tags:
``` python
# Set of all the integers from 0 to n - 1.
# (`n` and `l` are not defined in this cell; presumably they come from the
# "shuffled list" cell above -- confirm the execution order.)
full_set = set(range(n))
# Set of the integers that are present in the list `l`.
changed_set = set(l)
# NOTE(review): this bare expression is not the last statement of the cell,
# so its value is neither displayed nor stored.
full_set - changed_set
# The set difference keeps only the elements of full_set absent from `l`.
ns = full_set - changed_set
# pop() removes an element from the set and returns it.
ns.pop()
```
%%%% Output: execute_result
{5}
3
%% Cell type:markdown id: tags:
## Complexity:
- line 1: n insertions --> O(n)
- line 2: n insertions --> O(n)
- line 3: one traversal O(n), with one lookup at each step (O(1)) -> O(n)
-> Complexity of the whole algorithm: O(n)
# Note
## Complexity of the "sum" solution:
- One traversal of the list to compute the sum: O(n) steps, each addition costing O(1) -> O(n)
%% Cell type:markdown id: tags:
### `dict`: unordered set of key: value pairs
The dictionary (`dict`) is a very important data structure in Python. All namespaces are (nearly) dictionaries and "Namespaces are one honking great idea -- let's do more of those!" (The zen of Python).
A dict is a hashtable (a set) + associated values.
%% Cell type:code id: tags:
``` python
d = {}
d['b'] = 2
d['a'] = 1
print(d)
```
%%%% Output: stream
{'b': 2, 'a': 1}
%% Cell type:code id: tags:
``` python
d = {'a': 1, 'b': 2, 0: False, 1: True}
print(d)
```
%%%% Output: stream
{'a': 1, 'b': 2, 0: False, 1: True}
%% Cell type:markdown id: tags:
### Tip: parallel between `dict` and `list`
You can first think about `dict` as a super `list` which can be indexed with other objects than integers (and in particular with `str`).
%% Cell type:code id: tags:
``` python
l = ["value0", "value1"]
l.append("value2")
print(l)
```
%%%% Output: stream
['value0', 'value1', 'value2']
%% Cell type:code id: tags:
``` python
l[1]
```
%%%% Output: execute_result
'value1'
%% Cell type:code id: tags:
``` python
d = {"key0": "value0", "key1": "value1"}
d["key2"] = "value2"
print(d)
```
%%%% Output: stream
{'key0': 'value0', 'key1': 'value1', 'key2': 'value2'}
%% Cell type:code id: tags:
``` python
d["key1"]
```
%%%% Output: execute_result
'value1'
%% Cell type:markdown id: tags:
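But warning: a `dict` is not a sequence (it is based on a hashtable), so it cannot be indexed by position! Since Python 3.7 a `dict` preserves the insertion order of its keys, but older versions give no ordering guarantee.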
%% Cell type:markdown id: tags:
### `dict`: public methods
%% Cell type:code id: tags:
``` python
# dict have 11 public methods (with a list comprehension)
[name for name in dir(d) if not name.startswith('__')]
```
%%%% Output: execute_result
['clear',
'copy',
'fromkeys',
'get',
'items',
'keys',
'pop',
'popitem',
'setdefault',
'update',
'values']
%% Cell type:markdown id: tags:
### `dict`: different ways to loops over a dictionary
%% Cell type:code id: tags:
``` python
# loop with items
for key, value in d.items():
if isinstance(key, str):
print(key, value)
```
%%%% Output: stream
key0 value0
key1 value1
key2 value2
%% Cell type:code id: tags:
``` python
# loop with values
for value in d.values():
print(value)
```
%%%% Output: stream
value0
value1
value2
%% Cell type:code id: tags:
``` python
# loop with keys
for key in d.keys():
print(key)
```
%%%% Output: stream
key0
key1
key2
%% Cell type:code id: tags:
``` python
# dict comprehension (here for the "inversion" of the dictionary)
print(d)
d1 = {v: k for k, v in d.items()}
```
%%%% Output: stream
{'key0': 'value0', 'key1': 'value1', 'key2': 'value2'}
%% Cell type:markdown id: tags:
## Do it yourself:
Write a function that returns a dictionary containing the number of occurrences of letters in a text.
%% Cell type:code id: tags:
``` python
text = 'abbbcc'
```
%% Cell type:markdown id: tags:
#### A possible solution:
%% Cell type:code id: tags:
``` python
def count_elem(sequence):
    """Count how many times each element occurs in ``sequence``.

    Returns a dict mapping every distinct element of ``sequence`` to its
    number of occurrences. The elements must be hashable.
    """
    occurrences = {}
    for item in sequence:
        # get() supplies 0 the first time an element is met
        occurrences[item] = occurrences.get(item, 0) + 1
    return occurrences
print("text=", text, "counts=", count_elem(text))
```
%%%% Output: stream
text= Las Vegas Overlook Loop is a 6.3 mile loop trail located near Las Vegas counts= {'L': 3, 'a': 8, 's': 5, ' ': 13, 'V': 2, 'e': 6, 'g': 2, 'O': 1, 'v': 1, 'r': 3, 'l': 5, 'o': 7, 'k': 1, 'p': 2, 'i': 3, '6': 1, '.': 1, '3': 1, 'm': 1, 't': 2, 'c': 1, 'd': 1, 'n': 1}
%% Cell type:markdown id: tags:
## Do it yourself : smart completion (advanced)
We will reuse our function `extract_patterns`.
- For a text, count the appearance of each pattern (using a dictionary).
- Given a query pattern of size 2, propose the pattern of size 3 with the same prefix that has the highest frequency. Filter the keys of the previous dictionary so that they start with the query pattern.
%% Cell type:code id: tags:
``` python
def build_count_base(t):
    """Baseline counter: membership test before each update.

    Returns a dict mapping each distinct element of the iterable ``t`` to
    its number of occurrences.
    """
    counts = {}
    for item in t:
        if item not in counts:
            # first occurrence: create the entry
            counts[item] = 1
            continue
        counts[item] += 1
    return counts
def build_count_set(t):
    """Counter variant that pre-creates every key with a zero count.

    The distinct elements of ``t`` are collected with ``set`` first, so the
    counting loop never has to test whether a key already exists.
    ``t`` is traversed twice and must therefore be re-iterable.
    """
    counts = dict.fromkeys(set(t), 0)
    for item in t:
        counts[item] += 1
    return counts
def build_count_count(t):
    """Counter variant relying on the ``count`` method of ``t``.

    ``t.count(key)`` is called once per distinct element, i.e. ``t`` is
    scanned completely for every key. ``t`` must provide a ``count``
    method (e.g. ``str``, ``list`` or ``tuple``).
    """
    counts = {}
    for key in set(t):
        counts[key] = t.count(key)
    return counts
def build_count_excpt(t):
    """Counter variant using exception handling for unseen keys.

    The count is incremented optimistically; the ``KeyError`` raised for a
    key that is not yet in the dict is caught to create the entry.

    Returns a dict mapping each distinct element of the iterable ``t`` to
    its number of occurrences.
    """
    d = {}
    for s in t:
        try:
            d[s] += 1
        except KeyError:
            # Only a missing key is expected here; any other exception
            # (e.g. KeyboardInterrupt, a failing __hash__) must propagate.
            d[s] = 1
    return d
import collections
def build_count_counter(t):
    """Counter variant delegating all the work to ``collections.Counter``.

    Returns a ``collections.Counter`` (a dict subclass) mapping each
    distinct element of the iterable ``t`` to its number of occurrences.
    """
    tally = collections.Counter()
    tally.update(t)
    return tally
def build_count_defaultdict(t):
    """Counter variant based on ``collections.defaultdict(int)``.

    Missing keys are created on first access with the value ``int()``,
    i.e. 0, so no membership test is needed before incrementing.

    Returns a ``defaultdict`` mapping each distinct element of the
    iterable ``t`` to its number of occurrences.
    """
    d = collections.defaultdict(int)
    # Iterate over the argument `t`, not over a notebook-level variable:
    # the function must count what it is given.
    for k in t:
        d[k] += 1
    return d
s = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nam tristique at velit in varius. Cras ut ultricies orci. Fusce vel consequat ante, vitae luctus tortor. Sed condimentum faucibus enim, sit amet pulvinar ligula feugiat ac. Sed interdum id risus id rhoncus. Nullam nisi justo, ultrices eu est nec, hendrerit maximus lorem. Nam urna eros, accumsan nec magna eu, elementum semper diam. Nulla tempus, nibh id elementum dapibus, ex diam lacinia est, sit amet suscipit nulla nibh eu sapien. Aliquam orci enim, malesuada in facilisis vitae, pharetra sit amet mi. Pellentesque mi tortor, sagittis quis odio quis, fermentum faucibus ex. Aenean sagittis nisl orci. Maecenas tristique velit sed leo facilisis porttitor. "
# Repeat the sample text 10000 times to get an input large enough to time.
s = s*10000
# NOTE(review): bare expression in the middle of the cell -- its value is not displayed.
len(s)
# NOTE(review): the label below names five variants while six are timed
# (build_count_set and build_count_defaultdict are not listed) -- confirm the intended legend.
print(f"len(s) = {len(s)}, nbkeys {len(set(s))} base, count, count_count, except, colection.counter")
# First benchmark: the elements are the characters of s.
%timeit build_count_base(s)
%timeit build_count_set(s)
%timeit build_count_count(s)
%timeit build_count_excpt(s)
%timeit build_count_counter(s)
%timeit build_count_defaultdict(s)
print("with split")
# Second benchmark: the elements are the whitespace-separated tokens of s.
s2 = s.split()
print(f"len(s) = {len(s2)}, nbkeys {len(set(s2))} base, count, count_count, except, colection.counter")
%timeit build_count_base(s2)
%timeit build_count_set(s2)
%timeit build_count_count(s2)
%timeit build_count_excpt(s2)
%timeit build_count_counter(s2)
%timeit build_count_defaultdict(s2)
```
%%%% Output: stream
len(s) = 7160000, nbkeys 33 base, count, count_count, except, colection.counter
1 loop, best of 5: 550 ms per loop
1 loop, best of 5: 495 ms per loop
10 loops, best of 5: 146 ms per loop
1 loop, best of 5: 488 ms per loop
1 loop, best of 5: 265 ms per loop
1 loop, best of 5: 450 ms per loop
with split
len(s) = 1100000, nbkeys 90 base, count, count_count, except, colection.counter
10 loops, best of 5: 123 ms per loop
10 loops, best of 5: 113 ms per loop
1 loop, best of 5: 992 ms per loop
10 loops, best of 5: 101 ms per loop
10 loops, best of 5: 62.3 ms per loop
1 loop, best of 5: 449 ms per loop
%% Cell type:markdown id: tags:
#### A possible solution
%% Cell type:code id: tags:
``` python
text="Las Vegas Overlook Loop is a 6.3 mile loop trail located near Las Vegas"
def extract_patterns(text, n=3):
    """Extract the patterns (substrings) of size ``n`` from ``text``.

    The patterns are returned as a list, in their order of appearance;
    overlapping patterns are all included.
    """
    nb_patterns = len(text) - n + 1
    patterns = []
    for begin in range(nb_patterns):
        end = begin + n
        patterns.append(text[begin:end])
    return patterns
def guess(prefix, count):
    """Exercise skeleton: complete ``prefix`` with its most probable pattern.

    ``count`` is meant to map patterns to their number of occurrences.
    Until the two steps below are implemented, the placeholder "?" is
    returned whatever the arguments are.
    """
    # Step 1: get all the patterns in the keys of count that start with prefix
    # compat_prefix = DIY
    # Step 2: find among the compatible patterns the one which scores best
    # according to count
    return "?"
patterns = extract_patterns(text)
print("patterns = ", patterns)
patterns_count = count_elem(patterns)
print("patterns_counts = ", patterns_count)
print("guess for oo = ", guess("oo", patterns_count))
print("guess for eg = ", guess("eg", patterns_count))
```
%%%% Output: stream
patterns = ['Las', 'as ', 's V', ' Ve', 'Veg', 'ega', 'gas', 'as ', 's O', ' Ov', 'Ove', 'ver', 'erl', 'rlo', 'loo', 'ook', 'ok ', 'k L', ' Lo', 'Loo', 'oop', 'op ', 'p i', ' is', 'is ', 's a', ' a ', 'a 6', ' 6.', '6.3', '.3 ', '3 m', ' mi', 'mil', 'ile', 'le ', 'e l', ' lo', 'loo', 'oop', 'op ', 'p t', ' tr', 'tra', 'rai', 'ail', 'il ', 'l l', ' lo', 'loc', 'oca', 'cat', 'ate', 'ted', 'ed ', 'd n', ' ne', 'nea', 'ear', 'ar ', 'r L', ' La', 'Las', 'as ', 's V', ' Ve', 'Veg', 'ega', 'gas']
patterns_counts = {'Las': 2, 'as ': 3, 's V': 2, ' Ve': 2, 'Veg': 2, 'ega': 2, 'gas': 2, 's O': 1, ' Ov': 1, 'Ove': 1, 'ver': 1, 'erl': 1, 'rlo': 1, 'loo': 2, 'ook': 1, 'ok ': 1, 'k L': 1, ' Lo': 1, 'Loo': 1, 'oop': 2, 'op ': 2, 'p i': 1, ' is': 1, 'is ': 1, 's a': 1, ' a ': 1, 'a 6': 1, ' 6.': 1, '6.3': 1, '.3 ': 1, '3 m': 1, ' mi': 1, 'mil': 1, 'ile': 1, 'le ': 1, 'e l': 1, ' lo': 2, 'p t': 1, ' tr': 1, 'tra': 1, 'rai': 1, 'ail': 1, 'il ': 1, 'l l': 1, 'loc': 1, 'oca': 1, 'cat': 1, 'ate': 1, 'ted': 1, 'ed ': 1, 'd n': 1, ' ne': 1, 'nea': 1, 'ear': 1, 'ar ': 1, 'r L': 1, ' La': 1}
guess for oo = ?
guess for eg = ?
%% Cell type:code id: tags:
``` python
text="Las Vegas Overlook Loop is a 6.3 mile loop trail located near Las Vegas"
def extract_patterns(text, n=3):
    """Return the list of all the substrings of ``text`` having length ``n``.

    The substrings are produced by a sliding window moving from left to
    right, one character at a time.
    """
    starts = range(len(text) - n + 1)
    return [text[pos:pos + n] for pos in starts]
def guess(prefix, count):
    """Complete ``prefix`` with the most frequent pattern that starts with it.

    ``count`` maps patterns to their number of occurrences. The key of
    ``count`` starting with ``prefix`` that has the highest count is
    returned; in case of a tie the first such key (in the iteration order
    of ``count``) wins. ``None`` is returned when no key starts with
    ``prefix``.
    """
    # keep only the patterns compatible with the query prefix
    candidates = [pattern for pattern in count if pattern.startswith(prefix)]
    if not candidates:
        return None
    # Each candidate is scored with its own count. max() returns the first
    # maximal element, so ties are resolved in favour of the earliest key.
    return max(candidates, key=lambda pattern: count[pattern])
patterns = extract_patterns(text)
print("patterns = ", patterns)
patterns_count = count_elem(patterns)
print("patterns_counts = ", patterns_count)
print("guess for oo = ", guess("oo", patterns_count))
print("guess for eg = ", guess("eg", patterns_count))
```
%%%% Output: stream
patterns = ['Las', 'as ', 's V', ' Ve', 'Veg', 'ega', 'gas', 'as ', 's O', ' Ov', 'Ove', 'ver', 'erl', 'rlo', 'loo', 'ook', 'ok ', 'k L', ' Lo', 'Loo', 'oop', 'op ', 'p i', ' is', 'is ', 's a', ' a ', 'a 6', ' 6.', '6.3', '.3 ', '3 m', ' mi', 'mil', 'ile', 'le ', 'e l', ' lo', 'loo', 'oop', 'op ', 'p t', ' tr', 'tra', 'rai', 'ail', 'il ', 'l l', ' lo', 'loc', 'oca', 'cat', 'ate', 'ted', 'ed ', 'd n', ' ne', 'nea', 'ear', 'ar ', 'r L', ' La', 'Las', 'as ', 's V', ' Ve', 'Veg', 'ega', 'gas']
patterns_counts = {'Las': 2, 'as ': 3, 's V': 2, ' Ve': 2, 'Veg': 2, 'ega': 2, 'gas': 2, 's O': 1, ' Ov': 1, 'Ove': 1, 'ver': 1, 'erl': 1, 'rlo': 1, 'loo': 2, 'ook': 1, 'ok ': 1, 'k L': 1, ' Lo': 1, 'Loo': 1, 'oop': 2, 'op ': 2, 'p i': 1, ' is': 1, 'is ': 1, 's a': 1, ' a ': 1, 'a 6': 1, ' 6.': 1, '6.3': 1, '.3 ': 1, '3 m': 1, ' mi': 1, 'mil': 1, 'ile': 1, 'le ': 1, 'e l': 1, ' lo': 2, 'p t': 1, ' tr': 1, 'tra': 1, 'rai': 1, 'ail': 1, 'il ': 1, 'l l': 1, 'loc': 1, 'oca': 1, 'cat': 1, 'ate': 1, 'ted': 1, 'ed ': 1, 'd n': 1, ' ne': 1, 'nea': 1, 'ear': 1, 'ar ': 1, 'r L': 1, ' La': 1}
guess for oo = ook
guess for eg = ega
%% Cell type:code id: tags:
``` python
text="Las Vegas Overlook Loop is a 6.3 mile loop trail located near Las Vegas"
def extract_patterns(text, n=3):
    """Slide a window of ``n`` characters over ``text`` and collect its content.

    Returns the list of the successive windows; the list is empty when
    ``text`` has fewer than ``n`` characters.
    """
    collected = []
    offset = 0
    stop = len(text) - n + 1
    while offset < stop:
        collected.append(text[offset:offset + n])
        offset += 1
    return collected
def guess(prefix, count):
    """Rank the possible completions of ``prefix`` by decreasing frequency.

    ``count`` maps patterns to their number of occurrences. The result is
    a list of ``(pattern, occurrences)`` pairs, one per key of ``count``
    starting with ``prefix``, the most frequent first. Pairs with equal
    counts keep the iteration order of ``count``. The list is empty when
    no key matches.
    """
    ranked = []
    for pattern in count.keys():
        if pattern.startswith(prefix):
            ranked.append((pattern, count[pattern]))
    # in-place stable sort on the occurrences, highest first
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
patterns = extract_patterns(text)
print("patterns = ", patterns)
patterns_count = count_elem(patterns)
print("patterns_counts = ", patterns_count)
print("guess for oo = ", guess("oo", patterns_count))
print("guess for eg = ", guess("eg", patterns_count))
```
%%%% Output: stream
patterns = ['Las', 'as ', 's V', ' Ve', 'Veg', 'ega', 'gas', 'as ', 's O', ' Ov', 'Ove', 'ver', 'erl', 'rlo', 'loo', 'ook', 'ok ', 'k L', ' Lo', 'Loo', 'oop', 'op ', 'p i', ' is', 'is ', 's a', ' a ', 'a 6', ' 6.', '6.3', '.3 ', '3 m', ' mi', 'mil', 'ile', 'le ', 'e l', ' lo', 'loo', 'oop', 'op ', 'p t', ' tr', 'tra', 'rai', 'ail', 'il ', 'l l', ' lo', 'loc', 'oca', 'cat', 'ate', 'ted', 'ed ', 'd n', ' ne', 'nea', 'ear', 'ar ', 'r L', ' La', 'Las', 'as ', 's V', ' Ve', 'Veg', 'ega', 'gas']
patterns_counts = {'Las': 2, 'as ': 3, 's V': 2, ' Ve': 2, 'Veg': 2, 'ega': 2, 'gas': 2, 's O': 1, ' Ov': 1, 'Ove': 1, 'ver': 1, 'erl': 1, 'rlo': 1, 'loo': 2, 'ook': 1, 'ok ': 1, 'k L': 1, ' Lo': 1, 'Loo': 1, 'oop': 2, 'op ': 2, 'p i': 1, ' is': 1, 'is ': 1, 's a': 1, ' a ': 1, 'a 6': 1, ' 6.': 1, '6.3': 1, '.3 ': 1, '3 m': 1, ' mi': 1, 'mil': 1, 'ile': 1, 'le ': 1, 'e l': 1, ' lo': 2, 'p t': 1, ' tr': 1, 'tra': 1, 'rai': 1, 'ail': 1, 'il ': 1, 'l l': 1, 'loc': 1, 'oca': 1, 'cat': 1, 'ate': 1, 'ted': 1, 'ed ': 1, 'd n': 1, ' ne': 1, 'nea': 1, 'ear': 1, 'ar ': 1, 'r L': 1, ' La': 1}
guess for oo = [('oop', 2), ('ook', 1)]
guess for eg = [('ega', 2)]
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment