-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathbasic_analysis.py
More file actions
146 lines (117 loc) · 5.86 KB
/
Copy pathbasic_analysis.py
File metadata and controls
146 lines (117 loc) · 5.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import nltk
from collections import Counter
import os, sys
# Python 2 only: make UTF-8 the process-wide default encoding used for implicit
# str <-> unicode conversions.  reload(sys) comes first because
# sys.setdefaultencoding is removed from the sys namespace during interpreter
# start-up and only reappears after a reload.
# Python 3 has neither the reload builtin nor sys.setdefaultencoding (and its
# default encoding is already UTF-8), so the hack is skipped there instead of
# failing with NameError at import time.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')
# remove trivial words
import string
# Trivial words to filter out: start from NLTK's English stop-word list.
stopwords_set = set(nltk.corpus.stopwords.words('english'))
# 'again' is taken off the stop list (presumably because it is one of the
# Topic 3 keywords in myTopics -- confirm).  discard() rather than remove() so
# that a corpus version without 'again' does not raise KeyError at import time.
stopwords_set.discard(u'again')
# Bare numeric tokens to filter out: every one-digit string ('0'..'9') and
# every two-digit string ('00'..'99').
nums_set = set(string.digits)
nums_set.update(
    tens + ones
    for tens in string.digits
    for ones in string.digits
)
# Hand-maintained extra tokens to filter out in addition to the stop words.
extrawords_set = {
    # punctuation fragments
    u'-', u'.', u'..', u'...', u'--', u'\'',
    # URL / clock-time / number fragments
    u'https', u'p.m', u'a.m', u'000',
    # 'co' plus place names
    u'co', u'ohio', u'carolina', u'florida', u'colorado',
    u'iowa', u'alabama', u'hampshire', u'michigan', u'virginia',
    u'wisconsin', u'arizona', u'california', u'indiana',
    u'texas', u'washington', u'nevada',
    u'charlottesville', u'cleveland', u'pennsylvania',
    u'phoenix', u'bedminster', u'gettysburg',
}
# Combined filter: a token present in any of the three sets above is removed
# by basic_analysis() before counting.
filter_set = set().union(stopwords_set, extrawords_set, nums_set)
# Single-token substitution table (word-for-word only): token -> replacement.
# The code that applies the table is not in this part of the file.
# NOTE(review): keys and values are kept exactly as authored (e.g.
# 'wikileakes', 'democrates'); confirm whether those spellings are intended.
subsList = {
    # parties and health care
    'dems': 'democrates',
    'ocare': 'healthcare',
    'obamacare': 'healthcare',
    'hcare': 'healthcare',
    # treaties, leaks, wars
    'tpp': 'agreement',
    'wikileakes': 'leak',
    'brexit': 'exit',
    'wwi': 'war',
    'wwii': 'war',
    'nafta': 'agreement',
    # Clinton aliases
    'hrc': 'hillary',
    'hillarys': 'hillary',
    'hillaryclinton': 'hillary',
    # Trump aliases and family
    'djt': 'trump',
    'melania': 'trump',
    'ivanka': 'trump',
    'kushner': 'trump',
    'donaldtrump': 'trump',
    # institutions and agencies
    'dnc': 'democrates',
    'usss': 'fbi',
    'comey': 'fbi',
    'doj': 'justice',
    'nypd': 'police',
    # places and peoples
    'crimea': 'ukraine',
    'syrians': 'syrian',
    # companies and media outlets
    'buzzfeed': 'media',
    'foxconn': 'media',
    'breitbart': 'media',
    'instagram': 'media',
    'amazonwashingtonpost': 'media',
    'softbank': 'bank',
    # miscellaneous normalisation
    'deplorables': 'humble',
    'cancelled': 'cancel',
    'cancelling': 'cancel',
    'judgement': 'judge',
    'americanism': 'american',
}

# Misspelled token -> corrected token; merged into the substitution table.
# NOTE(review): 'falwell' -> 'farewell' may be rewriting a surname -- confirm.
typoList = {
    'hereos': 'heroes',
    'falwell': 'farewell',
    'substantialy': 'substantially',
    'amercan': 'american',
}
subsList.update(typoList)
# Topic keywords: sixteen hand-picked lists of eight keywords each.
# NOTE(review): 'ISIS' (Topic 16) is the only upper-case keyword, and subsList
# maps 'obamacare' (Topic 5) to 'healthcare' -- confirm both still match the
# tokens they are compared against.
myTopics = [
    # Topic 1
    ['hillary', 'clinton', 'crooked', 'bernie', 'obama', 'years', 'campaign', 'bad'],
    # Topic 2
    ['fake', 'news', 'dishonest', 'media', 'failing', 'big', 'story', 'said'],
    # Topic 3
    ['make', 'america', 'great', 'again', 'together', 'we', 'american', 'people'],
    # Topic 4
    ['white', 'house', 'great', 'day', 'honor', 'today', 'meeting', 'senator'],
    # Topic 5
    ['obamacare', 'repeal', 'replace', 'disaster', 'crazy', 'bill', 'failed', 'reform'],
    # Topic 6
    ['join', 'live', 'rally', 'tomorrow', 'tonight', 'tickets', 'today', 'speech'],
    # Topic 7
    ['korea', 'north', 'south', 'china', 'trade', 'deficit', 'problem', 'president'],
    # Topic 8
    ['law', 'enforcement', 'officers', 'police', 'executive', 'order', 'killed', 'victims'],
    # Topic 9
    ['trump', 'vote', 'poll', 'team', 'voters', 'final', 'americans', 'debate'],
    # Topic 10
    ['jobs', 'bring', 'back', 'dollars', 'optimism', 'economic', 'market', 'companies'],
    # Topic 11
    ['enjoy', 'interviewed', 'tonight', 'looking', 'forward', 'interview', 'prime', 'minister'],
    # Topic 12
    ['heroes', 'veterans', 'honor', 'today', 'act', 'announced', 'american', 'lives'],
    # Topic 13
    ['islamic', 'terror', 'radical', 'ban', 'tough', 'allowed', 'border', 'immigration'],
    # Topic 14
    ['fbi', 'russia', 'cia', 'emails', 'director', 'illegally', 'investigation', 'server'],
    # Topic 15
    ['mexico', 'wall', 'pay', 'trade', 'deficit', 'plant', 'crime', 'deal'],
    # Topic 16
    ['syria', 'ISIS', 'syrian', 'refugees', 'immigrants', 'putin', 'rebels', 'ceasefire'],
]

# Short labels, one per entry of myTopics and in the same order.
myTopicsNames = [
    'HRC', 'fakeNews', 'MAGA', 'whitehouse',
    'healthcare', 'join', 'korea&china', 'police',
    'vote', 'jobs', 'interview', 'veterans',
    'terror', 'fbi', 'mexico', 'refugee',
]
def wordFilter(wordList, filterWords):
    """Return a new list holding the items of wordList not found in filterWords.

    The order of the surviving items is preserved and duplicates are kept.
    filterWords only needs to support the ``in`` operator.
    """
    kept = []
    for token in wordList:
        if token in filterWords:
            continue
        kept.append(token)
    return kept
def basic_analysis(normalWordbag, capitalWordbag, showWordCloud):
    """Print word-frequency statistics and optionally draw two word clouds.

    Args:
        normalWordbag: iterable of word tokens.  Tokens found in the
            module-level ``filter_set`` are dropped before counting.
        capitalWordbag: iterable of word tokens, filtered the same way.  It
            is only used for the second word cloud, where the text is
            upper-cased (presumably the all-capital words -- confirm with
            the caller).
        showWordCloud: when truthy, draw one word cloud per bag, masked and
            coloured by ``trump3.jpg`` located next to this file, and call
            ``plt.show()``.

    Returns:
        None.
    """
    # Filter the stop words.
    normalWordbag = wordFilter(normalWordbag, filter_set)
    capitalWordbag = wordFilter(capitalWordbag, filter_set)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('%d non-stop words totally.' % len(normalWordbag))

    # Count the word occurrences.
    normalCounter = Counter(normalWordbag)
    print('%d non-repeative words.' % len(normalCounter))
    print("Most 30 common words:")
    print(normalCounter.most_common(30))

    if not showWordCloud:
        return

    # Plotting dependencies are imported only when a plot is requested, so the
    # statistics above work without them installed.
    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
    import matplotlib.pyplot as plt

    normalStringAll = ' '.join(normalWordbag)
    capitalStringAll = ' '.join(capitalWordbag).upper()

    # Read the mask / colour image.  scipy.misc.imread no longer exists in
    # SciPy >= 1.2; matplotlib's reader (already needed for plotting) loads the
    # JPEG as an image array as well.
    baseDir = os.path.dirname(__file__)
    trump_coloring = plt.imread(os.path.join(baseDir, "trump3.jpg"))

    wc = WordCloud(background_color="white", mask=trump_coloring,
                   stopwords=STOPWORDS, random_state=1)
    # Colour each word from the corresponding region of the image.
    image_colors = ImageColorGenerator(trump_coloring)

    # Figure 0: normal words, figure 1: capital words.
    for figureId, text in enumerate((normalStringAll, capitalStringAll)):
        if not text:
            # Nothing left after filtering: skip this cloud so the other one
            # can still be shown.
            continue
        plt.figure(figureId)
        wordcloud = wc.generate(text)
        plt.imshow(wordcloud.recolor(color_func=image_colors),
                   interpolation='bilinear')
        plt.axis('off')
    plt.show()
    return