file bible.txt.gz
gunzip -c bible.txt.gz | more
zmore bible.txt.gz
gunzip -c bible.txt.gz | less
gunzip -c bible.txt.gz | tail
gunzip -c bible.txt.gz | head
gunzip -c bible.txt.gz | wc
gunzip -c bible.txt.gz | wc
gunzip bible.txt
Algorithm
tr -sc 'A-Za-z' '\012' < bible.txt | sort | uniq -c | more
1
7973 a
236 A
1 aa
350 Aaron
2 Aaronites
1 Abaddon
1 Abagtha
1 Abana
4 Abarim
...
read from input file <
write to output file >
pipe |
1) more bible.txt
...
1:1 In the beginning God created the heaven and
1:2 And the earth was without form, and void; an
1:3 And God said, Let there be light: and there
1:4 And God saw the light, that [it was] good: a
...
2) tr -sc 'A-Za-z' '\012' < bible.txt | more
DOC
Welcome
To
The
World
...
3) Filtering with a simple gawk program ...
gunzip -c bible.txt.gz | tr -sc 'A-Za-z' '\012' | gawk 'BEGIN{flag=0};$0~/\<TEXT\>/{flag=1;next};$0~/\<\/TEXT\>/{flag=0;next};{if(flag>0){print}}' > bible.clean
4) Ordering and counting ...
tr -sc 'A-Za-z' '\012' < bible.clean | sort | uniq -c | more
7943 a
234 A
350 Aaron
2 Aaronites
...
tr 'a-z' 'A-Z' < bible.clean |
tr -sc 'A-Z' '\012' |
sort |
uniq -c
tr 'a-z' 'A-Z' < bible.clean |
tr -sc 'AEIOU' '\012' |
sort |
uniq -c
tr 'a-z' 'A-Z' < bible.clean |
tr -sc 'BCDFGHJKLMNPQRSTVWXYZ' '\012' |
sort |
uniq -c
Example Explanation
sort -d dictionary order
sort -f fold case
sort -n numeric order
sort -nr reverse numeric order
sort +1 start with field 1 (starting from 0)
sort +0.50 start with 50th character
sort +1.5 start with 5th character of field 1
tr -sc 'A-Za-z' '\012' < bible.clean | sort | uniq -c | sort -nr > bible.hist
. . .
1 freely
1 sorely
5 Surely
15 surely
1 falsely
1 fly
. . .
echo hello world | rev
dlrow olleh
echo hello world | rev | rev
hello world
tr -sc 'A-Za-z' '\012' < bible.clean > bible.words
tail -n +2 bible.words > bible.nextwords
paste bible.words bible.nextwords | more
The Old
Old Testament
Testament of
of the
...
paste bible.words bible.nextwords | sort | uniq -c > bible.bigrams
sort -nr < bible.bigrams | more
11445 of the
5964 the LORD
4880 in the
4044 and the
2461 shall be
...
tr -sc 'A-Za-z' '\012' < bible.clean | grep 'ing$' | sort | uniq -c | more
Example Explanation
grep gh find lines containing "gh"
grep '^con' find lines beginning with "con"
grep 'ing$' find lines ending with "ing"
grep -v gh don't display lines containing "gh"
grep -v '^con' don't display lines beginning with "con"
grep -v 'ing$' don't display lines ending with "ing"
Example explanation
grep '[A-Z]' lines with an uppercase char
grep '^[A-Z]' lines starting with an uppercase
grep '[A-Z]$' lines ending with an uppercase
grep '^[A-Z]*$' lines with all uppercase chars
grep '[aeiouAEIOU]' lines with a vowel
grep '^[aeiouAEIOU]' lines starting with a vowel
grep '[aeiouAEIOU]$' lines ending with a vowel
grep -i '[aeiou]' ditto
grep -i '^[aeiou]'
grep -i '[aeiou]$'
grep -i '^[^aeiou]' lines starting with a non-vowel
grep -i '[^aeiou]$' lines ending with a non-vowel
grep -i '[aeiou].*[aeiou]' lines with two or more vowels
grep -i '^[^aeiou]*[aeiou][^aeiou]*$' lines with exactly one vowel
Example Explanation
a match the letter "a"
[a-z] match any lowercase letter
[A-Z] match any uppercase letter
[0-9] match any digit
[0123456789] match any digit
[aeiouAEIOU] match any vowel
[^aeiouAEIOU] match any character except a vowel
. match any character
^ beginning of line
$ end of line
x* any number of x
x+ one or more of x (egrep only)
x | y x or y (egrep only)
(x) override precedence rules (egrep only)
sed 5q < bible.clean
sed '/light/q' bible.clean
Example Explanation
sed 's/light/dark/g'
sed 's/ly$/-ly/g' simple morph prog
sed 's/[ \011].*//g' select first field
echo darkness | spell
+ness darkness
awk '{print $1}'
cut -f1
print the second field
awk '{print $2}'
cut -f2
print the last field
awk '{print $NF}'
rev | cut -f1 | rev
awk '{print $(NF-1)}'
rev | cut -f2 | rev
print the number of fields
awk '{print NF}'
awk '$1 > 100 {print $0}' bible.hist
awk '$1 > 100 {print}' bible.hist
awk '$1 > 100' bible.hist
sort -u bible.words > bible.types
rev < bible.types | paste - bible.types | awk '$1 == $2'
a a
A A
aha aha
deed deed
did did
...
rev < bible.types | cat - bible.types | sort | uniq -c | awk '$1 >= 2 {print $2}'
a
A
ah
aha
dam
deed
deeps
...
lookup words ending in "ed"
awk '$2~/ed$/' bible.hist
grep 'ed$' bible.hist
awk '$2~/ed$/ {x = x + $1} END{print x}' bible.hist
tr -sc 'A-Za-z' '\012' < bible.clean | grep 'ed$' | wc -l
awk '$2~/ed$/ {x = x + 1} END{print x}' bible.hist
tr -sc 'A-Za-z' '\012' < bible.clean | grep 'ed$' | sort | uniq -c | wc -l
awk '/ed$/ {token = token + $1;
type = type + 1}
END {print token, type}' bible.hist
awk '/ed$/ {token += $1; type++}
END {print token, type}' bible.hist
Two programs for counting word frequencies:
tr -sc 'A-Za-z' '\012' < bible.clean | sort | uniq -c
tr -sc 'A-Za-z' '\012' < bible.clean | awk '{ freq[$0]++ }; END{for(w in freq) print freq[w], w }'
I(x;y) = log2( Pr(x,y) / (Pr(x) Pr(y)) )
I(x;y) ~ log2( N f(x,y) / (f(x) f(y)) )
paste bible.words bible.nextwords | sort | uniq -c > bible.bigrams
cat bible.hist bible.bigrams |
awk 'NF == 2 { f[$2]=$1}
NF == 3 { print log(N*$1/(f[$2]*f[$3]))/log(2), $2, $3}'
where N is the total number of words, N=`wc -l < bible.words`, and must be passed to awk (e.g. awk -v N="$N" '...')
Exercise 6: Mutual information is unstable for small bigram counts. Modify the previous program so that it doesn't produce any output when the bigram count is less than 5.