Skip to content

pull datasciencecourseraUW #33

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions assignment1/frequency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import sys
import json
import re

def getTweet(fp):
    """Read a file of line-delimited JSON tweets and return their texts.

    fp: path to a file containing one JSON object per line.
    Returns a list of UTF-8 encoded tweet texts; lines without a "text"
    field (delete/limit notices) are skipped.
    """
    tweet_list = []
    with open(fp) as tweet_file:  # 'with' guarantees the file is closed
        for line in tweet_file:
            decoded_tweet = json.loads(line)  # one JSON document per line
            # Only status objects carry a "text" field; skip everything else.
            if "text" in decoded_tweet:
                # Encode to UTF-8 to get proper international characters.
                tweet_list.append(decoded_tweet["text"].encode('utf-8'))
    return tweet_list


def frequency(tweet_list):
    """Print the relative frequency of every word across the given tweets.

    tweet_list: iterable of tweet texts.
    For each distinct word prints "<word> <frequency>", where frequency is
    the word's count divided by the total word count, to five decimals.
    """
    wordsDic = {}    # word -> count, later overwritten with word -> frequency
    total_count = 0
    for tweet in tweet_list:
        # Keep runs of word characters/apostrophes; drops punctuation.
        # (re.IGNORECASE was removed: it is a no-op for this pattern.)
        tweet_words = re.findall(r"[\w']+", tweet)
        for word in tweet_words:
            total_count += 1
            wordsDic[word] = wordsDic.get(word, 0) + 1  # count each word

    for word in wordsDic:
        wordsDic[word] = float(wordsDic[word]) / float(total_count)
        # print(...) with a single argument works under Python 2 and 3 alike
        print(word + " " + str("%.5f" % round(wordsDic[word], 5)))


def main():
    """Entry point: read the tweet file named on the command line and
    print the relative frequency of every word it contains."""
    tweets = getTweet(sys.argv[1])
    frequency(tweets)

if __name__ == '__main__':
main()
184 changes: 184 additions & 0 deletions assignment1/happiest_state.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import sys
import json
import re

# Two-letter USPS abbreviation -> full state/territory name, used to
# recognise the "City, ST" form of a tweet's place.full_name.
States = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}

# Full state name -> two-letter abbreviation: the exact inverse of States.
# Derived from States instead of being hand-maintained, so the two lookup
# tables can never drift apart.
States2 = {full_name: abbr for abbr, full_name in States.items()}

def dict(fn):
    """Load a sentiment lexicon file into a {term: score} mapping.

    fn: path to a tab-delimited file with one "term<TAB>score" pair per
    line (e.g. AFINN-111.txt).  Returns a dictionary of integer scores.
    NOTE(review): this function shadows the builtin ``dict``; renaming it
    (e.g. load_scores) would be safer, but callers use this name.
    """
    scores = {}
    with open(fn) as sent_file:  # 'with' guarantees the file is closed
        for line in sent_file:
            term, score = line.split("\t")  # tab-delimited: term, score
            scores[term] = int(score)  # int() tolerates the trailing newline
    return scores

def getTweet(fp):
    """Map tweet texts to the US state they were posted from.

    fp: path to a file containing one JSON tweet per line.
    Returns {UTF-8 encoded tweet text: two-letter state abbreviation} for
    tweets carrying a US "place" whose full_name resolves to a state.
    NOTE(review): keying by text means duplicate tweet texts collapse to a
    single entry — kept as in the original.
    """
    tweet_dic = {}
    with open(fp) as tweet_file:  # 'with' guarantees the file is closed
        for line in tweet_file:
            decoded_tweet = json.loads(line)
            place = decoded_tweet.get("place")
            if place is None or place["country_code"] != "US":
                continue
            full_name = place["full_name"]  # usually "City, ST" or "State, USA"
            if full_name is None:
                continue
            parts = full_name.split(",")
            # "City, ST" form: the abbreviation is the word after the comma.
            # Guard len(parts): the original raised IndexError on names
            # without a comma.
            state0 = parts[1].split()[0].encode('utf-8') if len(parts) > 1 else None
            # "State, USA" form: the state name is the word before the comma
            # (only single-word state names can match States.values()).
            state1 = parts[0].split()[0].encode('utf-8')
            if state0 in States:
                state = state0
            elif state1 in States.values():
                state = States2[state1]
            else:
                continue  # place did not resolve to a known state
            if "text" in decoded_tweet:
                text = decoded_tweet["text"].encode('utf-8')
                tweet_dic[text] = state
    return tweet_dic


def sent(score_list, tweet_dic):
    """Print the state whose tweets have the highest total sentiment.

    score_list: {term: integer sentiment score} lexicon.
    tweet_dic: {tweet text: state abbreviation}, as built by getTweet.
    Sums per-word scores for each tweet, accumulates them per state, and
    prints the state with the maximum total.  Prints nothing when
    tweet_dic is empty (the original raised ValueError on max()).
    """
    state_score = {}
    for tweet, state in tweet_dic.items():
        tweet_score = 0
        tweet_words = re.findall(r"[\w']+", tweet)  # words only, no punctuation
        for word in tweet_words:
            # Words missing from the lexicon contribute 0 to the score.
            tweet_score += score_list.get(word, 0)
        state_score[state] = state_score.get(state, 0) + tweet_score

    if state_score:  # guard: max() on an empty mapping would raise
        # max-by-value replaces the fragile k[v.index(max(v))] lookup
        print(max(state_score, key=state_score.get))

def main():
    """Load the sentiment lexicon and the geo-tagged tweets named on the
    command line, then print the happiest state."""
    lexicon = dict(sys.argv[1])
    tweets_by_state = getTweet(sys.argv[2])
    sent(lexicon, tweets_by_state)

if __name__ == '__main__':
main()
Binary file added assignment1/happiest_state.pyc
Binary file not shown.
10,171 changes: 10,171 additions & 0 deletions assignment1/output.txt

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions assignment1/problem_1_submission.txt

Large diffs are not rendered by default.

61 changes: 51 additions & 10 deletions assignment1/term_sentiment.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,58 @@
import sys
import json
import re

def hw():
    # Pre-assignment starter stub (removed by this PR): prints a greeting.
    print 'Hello, world!'
def dict(fn):
    """Load a sentiment lexicon file into a {term: score} mapping.

    fn: path to a tab-delimited file with one "term<TAB>score" pair per
    line (e.g. AFINN-111.txt).  Returns a dictionary of integer scores.
    NOTE(review): this function shadows the builtin ``dict``; renaming it
    (e.g. load_scores) would be safer, but callers use this name.
    """
    scores = {}
    with open(fn) as sent_file:  # 'with' guarantees the file is closed
        for line in sent_file:
            term, score = line.split("\t")  # tab-delimited: term, score
            scores[term] = int(score)  # int() tolerates the trailing newline
    return scores

def lines(fp):
    # Starter stub (removed by this PR): print the number of lines in the
    # already-open file handle.  Consumes the handle via readlines().
    print str(len(fp.readlines()))
def getTweet(fp):
    """Read a file of line-delimited JSON tweets and return their texts.

    fp: path to a file containing one JSON object per line.
    Returns a list of UTF-8 encoded tweet texts; lines without a "text"
    field (delete/limit notices) are skipped.
    """
    tweet_list = []
    with open(fp) as tweet_file:  # 'with' guarantees the file is closed
        for line in tweet_file:
            decoded_tweet = json.loads(line)  # one JSON document per line
            # Only status objects carry a "text" field; skip everything else.
            if "text" in decoded_tweet:
                # Encode to UTF-8 to get proper international characters.
                tweet_list.append(decoded_tweet["text"].encode('utf-8'))
    return tweet_list

def main():
    # Starter-code entry point (replaced by this PR's sentiment pipeline):
    # opens both input files and prints a greeting plus their line counts.
    sent_file = open(sys.argv[1])
    tweet_file = open(sys.argv[2])
    hw()
    lines(sent_file)
    lines(tweet_file)

def sent(score_list, tweet_list):
    """Estimate and print sentiment scores for terms absent from the lexicon.

    score_list: {term: integer sentiment score} lexicon.
    tweet_list: list of tweet texts.
    A tweet's score is the sum of its words' lexicon scores.  Each
    out-of-lexicon term is then scored as the mean score of the tweets
    whose text contains it, and "<term> <score>" is printed per term.
    """
    noterms = {}        # out-of-lexicon terms seen so far
    global_scores = {}  # tweet text -> total tweet score
    for tweet in tweet_list:
        tweet_score = 0
        tweet_words = re.findall(r"[\w']+", tweet)  # words only, no punctuation
        for word in tweet_words:
            if word in score_list:
                word_score = score_list[word]
            else:
                word_score = 0
                noterms[word] = word_score  # remember the unknown term
            tweet_score += word_score
        global_scores[tweet] = tweet_score

    # Second pass: average the scores of the tweets each unknown term occurs in.
    for term in noterms:
        occurrence = 0
        term_score = 0
        for tweet in global_scores:
            # NOTE(review): substring match, so "cat" also matches "catalog";
            # kept as-is to preserve the submitted behaviour.
            if term in tweet:
                occurrence += 1
                term_score += global_scores[tweet]
        # occurrence >= 1 always: every term came from some tweet's own text,
        # so the division below cannot be by zero.
        noterms[term] = float(term_score) / float(occurrence)
        print(term + " " + str("%.3f" % round(noterms[term], 3)))


def main():
    """Entry point: load the lexicon and the tweets named on the command
    line, then print estimated scores for out-of-lexicon terms."""
    lexicon = dict(sys.argv[1])
    tweets = getTweet(sys.argv[2])
    sent(lexicon, tweets)

if __name__ == '__main__':
main()
Binary file added assignment1/term_sentiment.pyc
Binary file not shown.
44 changes: 44 additions & 0 deletions assignment1/top_ten.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import sys
import json
import re

def getHash(fp):
    """Collect every hashtag text from a file of line-delimited JSON tweets.

    fp: path to a file containing one JSON object per line.
    Returns a list of UTF-8 encoded hashtag texts, repeats included, in
    the order encountered.
    """
    hash_list = []
    with open(fp) as tweet_file:  # 'with' guarantees the file is closed
        for line in tweet_file:
            decoded_tweet = json.loads(line)
            # Only status objects carry "entities"; skip deletes/limits.
            if "entities" in decoded_tweet:
                entities = decoded_tweet["entities"]
                # "hashtags" is a list of {"text": ..., ...} objects.
                # (local renamed from `hash`, which shadowed the builtin)
                for tag in entities.get("hashtags", []):
                    hash_list.append(tag["text"].encode('utf-8'))
    return hash_list


def frequency(hash_list):
    """Print the ten most common hashtags with their occurrence counts.

    hash_list: list of hashtag texts, repeats included.
    Prints "<hashtag> <count>" for the top ten, most frequent first.
    Fixes the original's IndexError when fewer than ten distinct hashtags
    exist, and drops the unused total_count accumulator.
    """
    from collections import Counter  # local import: file header lacks it
    counts = Counter(hash_list)
    # most_common handles both the sort and the "at most ten" cut-off;
    # for equal counts it keeps first-seen order, like the original's
    # stable sort over dict iteration.
    for tag, count in counts.most_common(10):
        print(tag + " " + str(float(count)))


def main():
    """Entry point: read the tweet file named on the command line and
    print its ten most frequent hashtags."""
    tags = getHash(sys.argv[1])
    frequency(tags)

if __name__ == '__main__':
main()
Binary file added assignment1/top_ten.pyc
Binary file not shown.
46 changes: 36 additions & 10 deletions assignment1/tweet_sentiment.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,43 @@
import sys
import json
import re

def hw():
    # Pre-assignment starter stub (removed by this PR): prints a greeting.
    print 'Hello, world!'
def dict(fn):
    """Load a sentiment lexicon file into a {term: score} mapping.

    fn: path to a tab-delimited file with one "term<TAB>score" pair per
    line (e.g. AFINN-111.txt).  Returns a dictionary of integer scores.
    NOTE(review): this function shadows the builtin ``dict``; renaming it
    (e.g. load_scores) would be safer, but callers use this name.
    """
    scores = {}
    with open(fn) as sent_file:  # 'with' guarantees the file is closed
        for line in sent_file:
            term, score = line.split("\t")  # tab-delimited: term, score
            scores[term] = int(score)  # int() tolerates the trailing newline
    return scores

def lines(fp):
    # Starter stub (removed by this PR): print the number of lines in the
    # already-open file handle.  Consumes the handle via readlines().
    print str(len(fp.readlines()))
def getTweet(fp):
    """Read a file of line-delimited JSON tweets and return their texts.

    fp: path to a file containing one JSON object per line.
    Returns a list of UTF-8 encoded tweet texts; lines without a "text"
    field (delete/limit notices) are skipped.
    """
    tweet_list = []
    with open(fp) as tweet_file:  # 'with' guarantees the file is closed
        for line in tweet_file:
            decoded_tweet = json.loads(line)  # one JSON document per line
            # Only status objects carry a "text" field; skip everything else.
            if "text" in decoded_tweet:
                # Encode to UTF-8 to get proper international characters.
                tweet_list.append(decoded_tweet["text"].encode('utf-8'))
    return tweet_list

def main():
    # Starter-code entry point (replaced by this PR's sentiment pipeline):
    # opens both input files and prints a greeting plus their line counts.
    sent_file = open(sys.argv[1])
    tweet_file = open(sys.argv[2])
    hw()
    lines(sent_file)
    lines(tweet_file)

def sent(score_list, tweet_list):
    """Print one sentiment score per tweet, in input order.

    score_list: {term: integer sentiment score} lexicon.
    tweet_list: list of tweet texts.
    A tweet's score is the sum of its words' lexicon scores (words absent
    from the lexicon count as 0); it is printed as a float, one per line.
    """
    for tweet in tweet_list:
        tweet_words = re.findall(r"[\w']+", tweet)  # words only, no punctuation
        # .get defaults unknown words to 0; sum() replaces the manual loop.
        tweet_score = sum(score_list.get(word, 0) for word in tweet_words)
        # print(...) with a single argument works under Python 2 and 3 alike
        print(float(tweet_score))

def main():
    """Entry point: score every tweet in the file named on the command
    line against the given sentiment lexicon and print each score."""
    lexicon = dict(sys.argv[1])
    tweets = getTweet(sys.argv[2])
    sent(lexicon, tweets)

if __name__ == '__main__':
main()
Loading