The following Python script was created to count keyword occurrences in a string. It expects a tab-delimited text file in which the first column is the ID, the second column is the text (status), and the third column is the date. It outputs a tab-delimited file with the ID, the date and, for each term, 0 if the term is not found in the string or 1 if it is.
Arguments
Command line
python scriptfile.py inputfile.txt outputfile.txt terms.txt
import string, os, sys
import operator, math
def _load_terms(path):
    """Return the search terms listed in *path*, one per line, in file order.

    Each line is stripped of surrounding whitespace and lower-cased.
    Blank lines are skipped: an empty term is a substring of every string,
    so it would be reported as found in every row and would add an empty
    column to the header.
    """
    with open(path, "r") as term_file:
        cleaned = (raw.strip().lower() for raw in term_file)
        return [term for term in cleaned if term]


def _split_record(line):
    """Split one input line into an ``(mpid, status, date)`` tuple.

    The line is split on the first two tabs only, so anything after the
    second tab (further tabs included) ends up in the date field, which is
    then stripped of surrounding whitespace. Missing fields come back as
    empty strings rather than as garbage sliced from the neighbouring field.
    """
    fields = line.rstrip("\r\n").split("\t", 2)
    fields += [""] * (3 - len(fields))
    mpid, status, date = fields
    return mpid, status, date.strip()


def _term_flags(status, terms):
    """Return "1" or "0" per term: is the term a substring of *status*?

    The comparison lower-cases *status*; *terms* are expected to be
    lower-cased already. Flags are computed independently for every entry
    of *terms*, so a term that is listed twice gets the same value in both
    of its columns.
    """
    lowered = status.lower()
    return ["1" if term in lowered else "0" for term in terms]


def main(argv):
    """Write a 0/1 term-presence table for a tab-delimited status file.

    Args:
        argv: Command-line style list: ``[script, inputfile, outputfile,
            termsfile]``.

    Returns:
        0 on success, 2 when fewer than three file arguments were given.

    The output starts with a header line (``mpid``, ``date``, then one
    column per term) followed by one line per non-blank input line. Every
    output line ends with a tab before the newline; that trailing tab is
    kept on purpose so the file layout stays what it has always been.
    """
    if len(argv) < 4:
        sys.stderr.write(
            "usage: python %s inputfile.txt outputfile.txt terms.txt\n"
            % (argv[0] if argv else "scriptfile.py")
        )
        return 2

    input_path, output_path, terms_path = argv[1], argv[2], argv[3]
    terms = _load_terms(terms_path)

    # The input is opened before the output so that a missing input file
    # does not leave behind a freshly truncated output file.
    with open(input_path, "r") as in_file:
        with open(output_path, "w") as out_file:
            out_file.write("\t".join(["mpid", "date"] + terms) + "\t\n")
            for line in in_file:
                if not line.strip():
                    # A blank line carries no record; skip it instead of
                    # emitting a row with an empty ID and date.
                    continue
                mpid, status, date = _split_record(line)
                # Progress output, one line per processed record.
                sys.stdout.write("mpid: " + mpid + "\n")
                flags = _term_flags(status, terms)
                out_file.write("\t".join([mpid, date] + flags) + "\t\n")
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Big thanks to Yaron Charka for help with this script.