#!/bin/sh
#
# Compares several language-guessing set-ups on Dutch versus other tweets:
#   * the original textcat (./oldtextcat),
#   * textcat with its standard patterns,
#   * textcat with extra patterns generated from tweets (tw-nl, tw-other).
#
# Part 1 splits each cleaned corpus in two parts with split2.py, generates
# patterns from part 1 and counts errors on part 2.
# Part 2 generates patterns from the complete corpora and compares the
# tweets selected from unfiltered.txt against a reference list.
#
# Requirements visible from this script: make, twclean, textpat, textcat,
# ./oldtextcat, python3.1 with split2.py, perl, and an existing
# unfiltered.txt in the current directory.
#
# Comments of the form "# n, m" record counts from earlier runs.
#
# NOTE(review): the textcat options are not defined here. Presumably
# -p= adds pattern files, -i= names built-in languages to ignore and
# -f reads one text per line from a file -- confirm with textcat's usage.

# Print a message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# split_corpus BASE
# Runs split2.py on BASE.txt and renames its two output files to
# BASE1.txt and BASE2.txt.  Aborts on failure so that stale out?.tmp files
# from an earlier run are never picked up.
split_corpus() {
  python3.1 split2.py "$1.txt" || die "split2.py failed on $1.txt"
  mv out1.tmp "${1}1.txt" || die "cannot create ${1}1.txt"
  mv out2.tmp "${1}2.txt" || die "cannot create ${1}2.txt"
}

# report_diff LABEL REFERENCE
# Compares REFERENCE with the scratch file "tmp" and prints, under LABEL,
# the number of diff lines starting with '-' (in REFERENCE only) and with
# '+' (in tmp only).  When the files differ, each count presumably also
# includes diff's own '---' / '+++' header line, hence the "+ 1" in the
# printed headings.
report_diff() {
  printf '%s, false negatives + 1\n' "$1"
  diff -U0 "$2" tmp | grep '^-' | wc -l
  printf '%s, false positives + 1\n' "$1"
  diff -U0 "$2" tmp | grep '^+' | wc -l
}

# Build the tools first; do not evaluate with missing or outdated binaries.
make > /dev/null || die "make failed"

################################################################
# Part 1: patterns from one part of each corpus, tested on the other part
################################################################

cat /net/corpora/twitter2/Samples/OneDayCorrected/*nl.txt | twclean > tw-nl.txt
cat /net/corpora/twitter2/Samples/OneDayCorrected/*other.txt | twclean > tw-other.txt

split_corpus tw-nl
split_corpus tw-other

textpat < tw-nl1.txt > tw-nl1.pat
textpat < tw-other1.txt > tw-other1.pat

echo 'Nederlandstalige tweets:'
wc -l tw-nl2.txt

# In the recorded results below, the second value is from a run in which
# the test set and the training set were swapped.

# False negatives: lines from the Dutch test part that do not get a Dutch
# label.  Lines containing SHORT are left out of the count.
echo 'False negatives originele textcat:'
./oldtextcat < tw-nl2.txt | grep -v SHORT | grep -v '^\[dutch\]' | wc -l
# 2337, 2367

echo 'False negatives standaard:'
textcat -l -i=af.utf8,fy.utf8 -f tw-nl2.txt | grep -v SHORT | grep -v '^nl' | wc -l
# 674, 674

echo 'False negatives tweetcat:'
textcat -l -p=tw-nl1.pat,tw-other1.pat -i=nl.utf8,af.utf8,fy.utf8 -f tw-nl2.txt | grep -v SHORT | grep -v '^tw-nl' | wc -l
# 920, 915

# Combined: both the "tw-nl" and the "nl" label count as Dutch.
echo 'False negatives gecombineerd:'
textcat -l -p=tw-nl1.pat,tw-other1.pat -i=af.utf8,fy.utf8 -f tw-nl2.txt | grep -v SHORT | grep -v '^tw-nl' | grep -v '^nl' | wc -l
# 855, 836

echo
echo 'Overige tweets:'
wc -l tw-other2.txt

# False positives: lines from the non-Dutch test part that get a Dutch label.
echo 'False positives originele textcat:'
./oldtextcat < tw-other2.txt | grep '^\[dutch\]' | wc -l
# 545, 569

echo 'False positives standaard:'
textcat -l -i=af.utf8,fy.utf8 -f tw-other2.txt | grep '^nl' | wc -l
# 1262, 1277

echo 'False positives tweetcat:'
textcat -l -p=tw-nl1.pat,tw-other1.pat -i=nl.utf8,af.utf8,fy.utf8 -f tw-other2.txt | grep -E '^(tw-)?nl' | wc -l
# 330, 388

echo 'False positives gecombineerd:'
textcat -l -p=tw-nl1.pat,tw-other1.pat -i=af.utf8,fy.utf8 -f tw-other2.txt | grep -E '^(tw-)?nl' | wc -l
# 326, 389

################################################################
# Part 2: patterns from the complete corpora, tested on unfiltered.txt
################################################################

echo
wc -l unfiltered.txt

textpat < tw-nl.txt > tw-nl.pat
textpat < tw-other.txt > tw-other.pat

# "tmp" holds the selection of the set-up under test; remove it however the
# script ends.  The signal trap turns a signal into a normal exit so that
# the EXIT trap also runs in shells that skip it on signals.
trap 'rm -f tmp' EXIT
trap 'exit 1' HUP INT TERM

# Reference list for the original textcat: the Dutch corpus without the
# lines it marks SHORT, with everything up to the first space removed.
./oldtextcat < tw-nl.txt | grep -v '^SHORT' | perl -p -e 's/.*? //' > tw-nl-unshort1.txt
./oldtextcat < unfiltered.txt | grep '^\[dutch\]' | perl -p -e 's/.*? //' > tmp
report_diff 'originele textcat' tw-nl-unshort1.txt
# false negatives + 1: 4705, false positives + 1: 5940

# Reference list for all textcat set-ups below: same idea, but the leading
# field ends at the first tab.  Only the SHORT marker is used here; the
# label itself is stripped, so no -i= selection is given.
textcat -l -f tw-nl.txt | grep -v '^SHORT' | perl -p -e 's/.*?\t//' > tw-nl-unshort2.txt

textcat -l -i=af.utf8,fy.utf8 -f unfiltered.txt | grep '^nl' | perl -p -e 's/.*?\t//' > tmp
report_diff 'standaard' tw-nl-unshort2.txt
# false negatives + 1: 1349, false positives + 1: 2540

# tw-nl ...
textcat -l -p=tw-nl.pat -i=nl.utf8,af.utf8,fy.utf8 -f unfiltered.txt | grep '^tw-nl' | perl -p -e 's/.*?\t//' > tmp
report_diff 'tw-nl i.p.v. nl' tw-nl-unshort2.txt
# false negatives + 1: 123, false positives + 1: 36454

# tw-other nl ...
textcat -l -p=tw-other.pat -i=af.utf8,fy.utf8 -f unfiltered.txt | grep '^nl' | perl -p -e 's/.*?\t//' > tmp
report_diff 'tw-other + nl' tw-nl-unshort2.txt
# false negatives + 1: 6277, false positives + 1: 120

# tw-nl nl ...
textcat -l -p=tw-nl.pat -i=af.utf8,fy.utf8 -f unfiltered.txt | grep -E '^(tw-)?nl' | perl -p -e 's/.*?\t//' > tmp
report_diff 'tw-nl + nl' tw-nl-unshort2.txt
# false negatives + 1: 111, false positives + 1: 36029

# tw-nl tw-other ...
textcat -l -p=tw-nl.pat,tw-other.pat -i=nl.utf8,af.utf8,fy.utf8 -f unfiltered.txt | grep '^tw-nl' | perl -p -e 's/.*?\t//' > tmp
report_diff 'tw-nl i.p.v. nl + tw-other' tw-nl-unshort2.txt
# false negatives + 1: 1836, false positives + 1: 4879

# tw-nl nl tw-other ...
textcat -l -p=tw-nl.pat,tw-other.pat -i=af.utf8,fy.utf8 -f unfiltered.txt | grep -E '^(tw-)?nl' | perl -p -e 's/.*?\t//' > tmp
report_diff 'tw-nl + nl + tw-other' tw-nl-unshort2.txt
# false negatives + 1: 1688, false positives + 1: 4874
# Raw Paste