Skip to content

pull datasciencecourseraUW #33

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions assignment1/frequency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import sys
import json
import re

def getTweet(fp):
    """Read a file of line-delimited JSON tweets and return their texts.

    fp: path to a file containing one JSON object per line.
    Returns a list of UTF-8 encoded tweet texts; lines without a "text"
    field (delete/limit notices) are skipped.
    """
    tweet_list = []
    with open(fp) as tweet_file:  # 'with' guarantees the file is closed
        for line in tweet_file:
            decoded_tweet = json.loads(line)  # one JSON document per line
            # Only status objects carry a "text" field; skip everything else.
            if "text" in decoded_tweet:
                # Encode to UTF-8 to get proper international characters.
                tweet_list.append(decoded_tweet["text"].encode('utf-8'))
    return tweet_list


def frequency(tweet_list):
    """Print the relative frequency of every word across the given tweets.

    tweet_list: iterable of tweet texts.
    For each distinct word prints "<word> <frequency>", where frequency is
    the word's count divided by the total word count, to five decimals.
    """
    wordsDic = {}    # word -> count, later overwritten with word -> frequency
    total_count = 0
    for tweet in tweet_list:
        # Keep runs of word characters/apostrophes; drops punctuation.
        # (re.IGNORECASE was removed: it is a no-op for this pattern.)
        tweet_words = re.findall(r"[\w']+", tweet)
        for word in tweet_words:
            total_count += 1
            wordsDic[word] = wordsDic.get(word, 0) + 1  # count each word

    for word in wordsDic:
        wordsDic[word] = float(wordsDic[word]) / float(total_count)
        # print(...) with a single argument works under Python 2 and 3 alike
        print(word + " " + str("%.5f" % round(wordsDic[word], 5)))


def main():
    """Entry point: read the tweet file named on the command line and
    print the relative frequency of every word it contains."""
    tweets = getTweet(sys.argv[1])
    frequency(tweets)

if __name__ == '__main__':
main()
184 changes: 184 additions & 0 deletions assignment1/happiest_state.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import sys
import json
import re

# Two-letter USPS abbreviation -> full state/territory name, used to
# recognise the "City, ST" form of a tweet's place.full_name.
States = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}

# Full state name -> two-letter abbreviation: the exact inverse of States.
# Derived from States instead of being hand-maintained, so the two lookup
# tables can never drift apart.
States2 = {full_name: abbr for abbr, full_name in States.items()}

def dict(fn):
    """Load a sentiment lexicon file into a {term: score} mapping.

    fn: path to a tab-delimited file with one "term<TAB>score" pair per
    line (e.g. AFINN-111.txt).  Returns a dictionary of integer scores.
    NOTE(review): this function shadows the builtin ``dict``; renaming it
    (e.g. load_scores) would be safer, but callers use this name.
    """
    scores = {}
    with open(fn) as sent_file:  # 'with' guarantees the file is closed
        for line in sent_file:
            term, score = line.split("\t")  # tab-delimited: term, score
            scores[term] = int(score)  # int() tolerates the trailing newline
    return scores

def getTweet(fp):
    """Map tweet texts to the US state they were posted from.

    fp: path to a file containing one JSON tweet per line.
    Returns {UTF-8 encoded tweet text: two-letter state abbreviation} for
    tweets carrying a US "place" whose full_name resolves to a state.
    NOTE(review): keying by text means duplicate tweet texts collapse to a
    single entry — kept as in the original.
    """
    tweet_dic = {}
    with open(fp) as tweet_file:  # 'with' guarantees the file is closed
        for line in tweet_file:
            decoded_tweet = json.loads(line)
            place = decoded_tweet.get("place")
            if place is None or place["country_code"] != "US":
                continue
            full_name = place["full_name"]  # usually "City, ST" or "State, USA"
            if full_name is None:
                continue
            parts = full_name.split(",")
            # "City, ST" form: the abbreviation is the word after the comma.
            # Guard len(parts): the original raised IndexError on names
            # without a comma.
            state0 = parts[1].split()[0].encode('utf-8') if len(parts) > 1 else None
            # "State, USA" form: the state name is the word before the comma
            # (only single-word state names can match States.values()).
            state1 = parts[0].split()[0].encode('utf-8')
            if state0 in States:
                state = state0
            elif state1 in States.values():
                state = States2[state1]
            else:
                continue  # place did not resolve to a known state
            if "text" in decoded_tweet:
                text = decoded_tweet["text"].encode('utf-8')
                tweet_dic[text] = state
    return tweet_dic


def sent(score_list, tweet_dic):
    """Print the state whose tweets have the highest total sentiment.

    score_list: {term: integer sentiment score} lexicon.
    tweet_dic: {tweet text: state abbreviation}, as built by getTweet.
    Sums per-word scores for each tweet, accumulates them per state, and
    prints the state with the maximum total.  Prints nothing when
    tweet_dic is empty (the original raised ValueError on max()).
    """
    state_score = {}
    for tweet, state in tweet_dic.items():
        tweet_score = 0
        tweet_words = re.findall(r"[\w']+", tweet)  # words only, no punctuation
        for word in tweet_words:
            # Words missing from the lexicon contribute 0 to the score.
            tweet_score += score_list.get(word, 0)
        state_score[state] = state_score.get(state, 0) + tweet_score

    if state_score:  # guard: max() on an empty mapping would raise
        # max-by-value replaces the fragile k[v.index(max(v))] lookup
        print(max(state_score, key=state_score.get))

def main():
    """Load the sentiment lexicon and the geo-tagged tweets named on the
    command line, then print the happiest state."""
    lexicon = dict(sys.argv[1])
    tweets_by_state = getTweet(sys.argv[2])
    sent(lexicon, tweets_by_state)

if __name__ == '__main__':
main()
Binary file added assignment1/happiest_state.pyc
Binary file not shown.
10,171 changes: 10,171 additions & 0 deletions assignment1/output.txt

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions assignment1/problem_1_submission.txt

Large diffs are not rendered by default.

61 changes: 51 additions & 10 deletions assignment1/term_sentiment.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,58 @@
import sys
import json
import re

def hw():
    # Pre-assignment starter stub (removed by this PR): prints a greeting.
    print 'Hello, world!'
def dict(fn):
    """Load a sentiment lexicon file into a {term: score} mapping.

    fn: path to a tab-delimited file with one "term<TAB>score" pair per
    line (e.g. AFINN-111.txt).  Returns a dictionary of integer scores.
    NOTE(review): this function shadows the builtin ``dict``; renaming it
    (e.g. load_scores) would be safer, but callers use this name.
    """
    scores = {}
    with open(fn) as sent_file:  # 'with' guarantees the file is closed
        for line in sent_file:
            term, score = line.split("\t")  # tab-delimited: term, score
            scores[term] = int(score)  # int() tolerates the trailing newline
    return scores

def lines(fp):
    # Starter stub (removed by this PR): print the number of lines in the
    # already-open file handle.  Consumes the handle via readlines().
    print str(len(fp.readlines()))
def getTweet(fp):
    """Read a file of line-delimited JSON tweets and return their texts.

    fp: path to a file containing one JSON object per line.
    Returns a list of UTF-8 encoded tweet texts; lines without a "text"
    field (delete/limit notices) are skipped.
    """
    tweet_list = []
    with open(fp) as tweet_file:  # 'with' guarantees the file is closed
        for line in tweet_file:
            decoded_tweet = json.loads(line)  # one JSON document per line
            # Only status objects carry a "text" field; skip everything else.
            if "text" in decoded_tweet:
                # Encode to UTF-8 to get proper international characters.
                tweet_list.append(decoded_tweet["text"].encode('utf-8'))
    return tweet_list

def main():
    # Starter-code entry point (replaced by this PR's sentiment pipeline):
    # opens both input files and prints a greeting plus their line counts.
    sent_file = open(sys.argv[1])
    tweet_file = open(sys.argv[2])
    hw()
    lines(sent_file)
    lines(tweet_file)

def sent(score_list, tweet_list):
    """Estimate and print sentiment scores for terms absent from the lexicon.

    score_list: {term: integer sentiment score} lexicon.
    tweet_list: list of tweet texts.
    A tweet's score is the sum of its words' lexicon scores.  Each
    out-of-lexicon term is then scored as the mean score of the tweets
    whose text contains it, and "<term> <score>" is printed per term.
    """
    noterms = {}        # out-of-lexicon terms seen so far
    global_scores = {}  # tweet text -> total tweet score
    for tweet in tweet_list:
        tweet_score = 0
        tweet_words = re.findall(r"[\w']+", tweet)  # words only, no punctuation
        for word in tweet_words:
            if word in score_list:
                word_score = score_list[word]
            else:
                word_score = 0
                noterms[word] = word_score  # remember the unknown term
            tweet_score += word_score
        global_scores[tweet] = tweet_score

    # Second pass: average the scores of the tweets each unknown term occurs in.
    for term in noterms:
        occurrence = 0
        term_score = 0
        for tweet in global_scores:
            # NOTE(review): substring match, so "cat" also matches "catalog";
            # kept as-is to preserve the submitted behaviour.
            if term in tweet:
                occurrence += 1
                term_score += global_scores[tweet]
        # occurrence >= 1 always: every term came from some tweet's own text,
        # so the division below cannot be by zero.
        noterms[term] = float(term_score) / float(occurrence)
        print(term + " " + str("%.3f" % round(noterms[term], 3)))


def main():
    """Entry point: load the lexicon and the tweets named on the command
    line, then print estimated scores for out-of-lexicon terms."""
    lexicon = dict(sys.argv[1])
    tweets = getTweet(sys.argv[2])
    sent(lexicon, tweets)

if __name__ == '__main__':
main()
Binary file added assignment1/term_sentiment.pyc
Binary file not shown.
44 changes: 44 additions & 0 deletions assignment1/top_ten.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import sys
import json
import re

def getHash(fp):
    """Collect every hashtag text from a file of line-delimited JSON tweets.

    fp: path to a file containing one JSON object per line.
    Returns a list of UTF-8 encoded hashtag texts, repeats included, in
    the order encountered.
    """
    hash_list = []
    with open(fp) as tweet_file:  # 'with' guarantees the file is closed
        for line in tweet_file:
            decoded_tweet = json.loads(line)
            # Only status objects carry "entities"; skip deletes/limits.
            if "entities" in decoded_tweet:
                entities = decoded_tweet["entities"]
                # "hashtags" is a list of {"text": ..., ...} objects.
                # (local renamed from `hash`, which shadowed the builtin)
                for tag in entities.get("hashtags", []):
                    hash_list.append(tag["text"].encode('utf-8'))
    return hash_list


def frequency(hash_list):
    """Print the ten most common hashtags with their occurrence counts.

    hash_list: list of hashtag texts, repeats included.
    Prints "<hashtag> <count>" for the top ten, most frequent first.
    Fixes the original's IndexError when fewer than ten distinct hashtags
    exist, and drops the unused total_count accumulator.
    """
    from collections import Counter  # local import: file header lacks it
    counts = Counter(hash_list)
    # most_common handles both the sort and the "at most ten" cut-off;
    # for equal counts it keeps first-seen order, like the original's
    # stable sort over dict iteration.
    for tag, count in counts.most_common(10):
        print(tag + " " + str(float(count)))


def main():
    """Entry point: read the tweet file named on the command line and
    print its ten most frequent hashtags."""
    tags = getHash(sys.argv[1])
    frequency(tags)

if __name__ == '__main__':
main()
Binary file added assignment1/top_ten.pyc
Binary file not shown.
46 changes: 36 additions & 10 deletions assignment1/tweet_sentiment.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,43 @@
import sys
import json
import re

def hw():
    # Pre-assignment starter stub (removed by this PR): prints a greeting.
    print 'Hello, world!'
def dict(fn):
    """Load a sentiment lexicon file into a {term: score} mapping.

    fn: path to a tab-delimited file with one "term<TAB>score" pair per
    line (e.g. AFINN-111.txt).  Returns a dictionary of integer scores.
    NOTE(review): this function shadows the builtin ``dict``; renaming it
    (e.g. load_scores) would be safer, but callers use this name.
    """
    scores = {}
    with open(fn) as sent_file:  # 'with' guarantees the file is closed
        for line in sent_file:
            term, score = line.split("\t")  # tab-delimited: term, score
            scores[term] = int(score)  # int() tolerates the trailing newline
    return scores

def lines(fp):
    # Starter stub (removed by this PR): print the number of lines in the
    # already-open file handle.  Consumes the handle via readlines().
    print str(len(fp.readlines()))
def getTweet(fp):
    """Read a file of line-delimited JSON tweets and return their texts.

    fp: path to a file containing one JSON object per line.
    Returns a list of UTF-8 encoded tweet texts; lines without a "text"
    field (delete/limit notices) are skipped.
    """
    tweet_list = []
    with open(fp) as tweet_file:  # 'with' guarantees the file is closed
        for line in tweet_file:
            decoded_tweet = json.loads(line)  # one JSON document per line
            # Only status objects carry a "text" field; skip everything else.
            if "text" in decoded_tweet:
                # Encode to UTF-8 to get proper international characters.
                tweet_list.append(decoded_tweet["text"].encode('utf-8'))
    return tweet_list

def main():
    # Starter-code entry point (replaced by this PR's sentiment pipeline):
    # opens both input files and prints a greeting plus their line counts.
    sent_file = open(sys.argv[1])
    tweet_file = open(sys.argv[2])
    hw()
    lines(sent_file)
    lines(tweet_file)

def sent(score_list, tweet_list):
    """Print one sentiment score per tweet, in input order.

    score_list: {term: integer sentiment score} lexicon.
    tweet_list: list of tweet texts.
    A tweet's score is the sum of its words' lexicon scores (words absent
    from the lexicon count as 0); it is printed as a float, one per line.
    """
    for tweet in tweet_list:
        tweet_words = re.findall(r"[\w']+", tweet)  # words only, no punctuation
        # .get defaults unknown words to 0; sum() replaces the manual loop.
        tweet_score = sum(score_list.get(word, 0) for word in tweet_words)
        # print(...) with a single argument works under Python 2 and 3 alike
        print(float(tweet_score))

def main():
    """Entry point: score every tweet in the file named on the command
    line against the given sentiment lexicon and print each score."""
    lexicon = dict(sys.argv[1])
    tweets = getTweet(sys.argv[2])
    sent(lexicon, tweets)

if __name__ == '__main__':
main()
Loading