BASH   32
all
Guest on 10th March 2023 12:22:15 AM


  #!/bin/sh
  #
  # Measures how well several language-guessing set-ups tell Dutch tweets
  # apart from other tweets, printing one error count per set-up.
  #
  # Part 1 (held-out test): the Dutch and the non-Dutch sample are each split
  #   in two; patterns are built from the first half with textpat and errors
  #   are counted on the second half.
  # Part 2 (full run): patterns are built from the complete samples, every
  #   set-up is run over unfiltered.txt, and the lines it labels Dutch are
  #   diffed against the non-SHORT lines of the Dutch sample.
  #
  # Needs, none of which is defined here: twclean, textpat, textcat and perl
  # on PATH; ./oldtextcat, split2.py, unfiltered.txt and a Makefile in the
  # current directory.
  #
  # Assumptions taken from how the output is filtered below (TODO confirm
  # against the tools themselves):
  #   - split2.py leaves its two halves in out1.tmp and out2.tmp;
  #   - oldtextcat prints "[language] text", textcat -l prints
  #     "language<TAB>text", and both print SHORT for lines they skip;
  #   - -p adds pattern files and -i leaves out the listed built-in languages
  #     (the labels say "tw-nl i.p.v. nl", i.e. tw-nl instead of nl).
  #
  # Environment overrides (defaults reproduce the original behaviour):
  #   CORPUS_DIR  directory holding the *nl.txt and *other.txt samples
  #   PYTHON      interpreter used to run split2.py
  #
  # Numbers in trailing comments are results recorded by the original author.
  # The printed labels are Dutch and are kept verbatim:
  #   Nederlandstalige = Dutch-language, overige = other, originele = original,
  #   standaard = default, gecombineerd = combined, i.p.v. = instead of.

  # Stop at the first failing step instead of printing counts computed from
  # missing or stale files.
  set -eu

  # Print a message on stderr and abort.
  die() {
    printf '%s\n' "$*" >&2
    exit 1
  }

  corpus_dir=${CORPUS_DIR:-/net/corpora/twitter2/Samples/OneDayCorrected}
  python=${PYTHON:-python3.1}

  # Scratch files go in a private directory that is removed on every exit
  # path, including interruption.
  workdir=$(mktemp -d) || die "cannot create a temporary directory"
  trap 'rm -rf -- "$workdir"' EXIT
  trap 'exit 1' HUP INT TERM
  found=$workdir/found   # text of the lines a set-up labelled Dutch
  delta=$workdir/delta   # diff between the reference list and $found

  # textcat option strings shared by several set-ups.
  skip_af_fy='-i=af.utf8,fy.utf8'
  skip_nl_af_fy='-i=nl.utf8,af.utf8,fy.utf8'
  split_pats='-p=tw-nl1.pat,tw-other1.pat'

  # label_dutch ERE [TEXTCAT_OPTION...]
  # Runs textcat in line mode over unfiltered.txt with the given options and
  # stores in $found the lines whose label matches ERE, with everything up to
  # and including the first tab removed.
  label_dutch() {
    dutch_re=$1
    shift
    textcat -l "$@" -f unfiltered.txt |
      grep -E "$dutch_re" |
      perl -p -e 's/.*?\t//' > "$found"
  }

  # report_errors REFERENCE LABEL
  # Diffs REFERENCE against $found once and prints, under LABEL, the number of
  # removed lines (false negatives) and added lines (false positives).
  # When the files differ, diff's "---" and "+++" header lines match the same
  # patterns, so each count is one too high; the labels say "+ 1" for that
  # reason.
  report_errors() {
    diff_status=0
    diff -U0 "$1" "$found" > "$delta" || diff_status=$?
    # diff exits 1 when the files merely differ; anything higher is trouble.
    [ "$diff_status" -le 1 ] || die "diff failed comparing $1"
    printf '%s, false negatives + 1\n' "$2"
    grep '^-' "$delta" | wc -l
    printf '%s, false positives + 1\n' "$2"
    grep '^+' "$delta" | wc -l
  }

  make > /dev/null

  # Build the cleaned samples, split each in two and derive patterns from the
  # first half.
  for sample in nl other; do
    set -- "$corpus_dir"/*"$sample.txt"
    # An unmatched glob stays literal; catch it here because a failing cat at
    # the head of the pipeline would otherwise go unnoticed.
    [ -e "$1" ] || die "no *$sample.txt files found in $corpus_dir"
    cat -- "$@" | twclean > "tw-$sample.txt"
    "$python" split2.py "tw-$sample.txt"
    mv out1.tmp "tw-${sample}1.txt"
    mv out2.tmp "tw-${sample}2.txt"
    textpat < "tw-${sample}1.txt" > "tw-${sample}1.pat"
  done

  ################################################################
  # Part 1: held-out halves
  #
  # In each recorded pair the second value is from a run with the test and
  # training sets swapped.
  #
  # NOTE(review): SHORT is matched anywhere in the line here but anchored as
  # ^SHORT in part 2, so a tweet containing the word SHORT is dropped from the
  # false-negative counts. Left as is so the recorded numbers stay
  # reproducible; confirm whether anchoring was intended.

  echo 'Nederlandstalige tweets:'
  wc -l tw-nl2.txt

  echo 'False negatives originele textcat:'
  ./oldtextcat < tw-nl2.txt | grep -v -e SHORT -e '^\[dutch\]' | wc -l
  # 2337, 2367

  echo 'False negatives standaard:'
  textcat -l "$skip_af_fy" -f tw-nl2.txt |
    grep -v -e SHORT -e '^nl' | wc -l
  # 674, 674

  echo 'False negatives tweetcat:'
  textcat -l "$split_pats" "$skip_nl_af_fy" -f tw-nl2.txt |
    grep -v -e SHORT -e '^tw-nl' | wc -l
  # 920, 915

  echo 'False negatives gecombineerd:'
  textcat -l "$split_pats" "$skip_af_fy" -f tw-nl2.txt |
    grep -v -e SHORT -e '^tw-nl' -e '^nl' | wc -l
  # 855, 836

  echo

  echo 'Overige tweets:'
  wc -l tw-other2.txt

  echo 'False positives originele textcat:'
  ./oldtextcat < tw-other2.txt | grep '^\[dutch\]' | wc -l
  # 545, 569

  echo 'False positives standaard:'
  textcat -l "$skip_af_fy" -f tw-other2.txt | grep '^nl' | wc -l
  # 1262, 1277

  echo 'False positives tweetcat:'
  textcat -l "$split_pats" "$skip_nl_af_fy" -f tw-other2.txt |
    grep -E '^(tw-)?nl' | wc -l
  # 330, 388

  echo 'False positives gecombineerd:'
  textcat -l "$split_pats" "$skip_af_fy" -f tw-other2.txt |
    grep -E '^(tw-)?nl' | wc -l
  # 326, 389

  ################################################################
  # Part 2: complete samples against unfiltered.txt

  echo

  wc -l unfiltered.txt

  textpat < tw-nl.txt > tw-nl.pat
  textpat < tw-other.txt > tw-other.pat

  # Reference list and candidates for the original textcat; the perl step
  # removes everything up to and including the first space.
  ./oldtextcat < tw-nl.txt | grep -v '^SHORT' |
    perl -p -e 's/.*? //' > tw-nl-unshort1.txt
  ./oldtextcat < unfiltered.txt | grep '^\[dutch\]' |
    perl -p -e 's/.*? //' > "$found"
  report_errors tw-nl-unshort1.txt 'originele textcat'
  # 4705 (false negatives), 5940 (false positives)

  # Reference list for every textcat set-up below.
  textcat -l -f tw-nl.txt | grep -v '^SHORT' |
    perl -p -e 's/.*?\t//' > tw-nl-unshort2.txt

  label_dutch '^nl' "$skip_af_fy"
  report_errors tw-nl-unshort2.txt 'standaard'
  # 1349, 2540

  # tw-nl ...
  label_dutch '^tw-nl' -p=tw-nl.pat "$skip_nl_af_fy"
  report_errors tw-nl-unshort2.txt 'tw-nl i.p.v. nl'
  # 123, 36454

  # tw-other nl ...
  label_dutch '^nl' -p=tw-other.pat "$skip_af_fy"
  report_errors tw-nl-unshort2.txt 'tw-other + nl'
  # 6277, 120

  # tw-nl nl ...
  label_dutch '^(tw-)?nl' -p=tw-nl.pat "$skip_af_fy"
  report_errors tw-nl-unshort2.txt 'tw-nl + nl'
  # 111, 36029

  # tw-nl tw-other ...
  label_dutch '^tw-nl' -p=tw-nl.pat,tw-other.pat "$skip_nl_af_fy"
  report_errors tw-nl-unshort2.txt 'tw-nl i.p.v. nl + tw-other'
  # 1836, 4879

  # tw-nl nl tw-other ...
  label_dutch '^(tw-)?nl' -p=tw-nl.pat,tw-other.pat "$skip_af_fy"
  report_errors tw-nl-unshort2.txt 'tw-nl + nl + tw-other'
  # 1688, 4874

Raw Paste

Login or Register to edit or fork this paste. It's free.