I'm having trouble getting my slack bots news feeder to ban words, the trigger words are working fine, but its letting banned words through but it catches them because it prints out found banned word: xxxxx. I'm new to python and I just don't get what's up.
triggers = ['SEC', 'CSA', 'OSC', 'CFTC', 'CME', 'CBOE', 'AMD', 'Intel', 'Nvidia',
'Bitcoin', 'blockchain', 'Apple', 'Amazon', 'Google', 'Microsoft',
'commerce', 'business', 'law', 'legal', 'financial', 'hack', 'hacked',
'chains', 'chairman', 'CEO', 'board', 'bank']
banned = ['technical', 'analysis', 'bearish', 'bullish', 'trading', 'trade', 'opinion',
'sponsored', 'price', 'watch']
def feedparsecheck(url):
feed = feedparser.parse(url)
feed_title = feed['feed']['title']
feed_entries = feed.entries
database()
print "feed 30 min"
for entry in feed.entries:
article_title = entry.title
article_link = entry.link
for trig in triggers:
if trig.lower() in article_title.lower():#trigger
for ban in banned:
if ban.lower() not in article_title.lower():#banned
response = "%s\n%s\n" % (article_title, article_link)
article_link = str(article_link.strip())
if not in_database(article_link):
update_database(article_link)
#print article_link
slack_client.api_call("chat.postMessage", channel=NEWS, text=''.join(response), as_user=True)
else:
print "found banned word:- " + ban
You are iterating over all triggers and all banned words for all articles, which means that every article will get sent to your channel for every banned word that's not in the title * number of triggers in the title.
Example:
Bitcoin trading for lower price after hack
Will get sent to your channel 16 times. 2 triggers (Bitcoin, hack) * 8 banned words not in title (10 - len(trading, price)) = 16.
To fix:
title_lower = article_title.lower()
if any(trig.lower() in title_lower for trig in triggers):
if any(ban.lower() in title_lower for ban in banned):
print 'found banned word:- '+ ban
else:
# post to slack channel
Related
I have a list of 3800 names I want to remove from 750K sentences.
The names can contain multiple words such as "The White Stripes".
Some names might also be look like a subset of a larger name, ex: 'Ame' may be one name and 'Amelie' may be another.
This is what my current implementation looks like:
def find_whole_word(w):
return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
names_lowercase = ['the white stripes', 'the beatles', 'slayer', 'ame', 'amelie'] # 3800+ names
def strip_names(sentence: str):
token = sentence.lower()
has_name = False
matches = []
for name in names_lowercase:
match = find_whole_word(name)(token)
if match:
matches.append(match)
def get_match(match):
return match.group(1)
matched_strings = list(map(get_match, matches))
matched_strings.sort(key=len, reverse=True)
for matched_string in matched_strings:
# strip names at the start, end and when they occur in the middle of text (with whitespace around)
token = re.sub(rf"(?<!\S){matched_string}(?!\S)", "", token)
return token
sentences = [
"how now brown cow",
"die hard fan of slayer",
"the white stripes kill",
"besides slayer I believe the white stripes are the best",
"who let ame out",
"amelie has got to go"
] # 750K+ sentences
filtered_list = [strip_names(sentence) for sentence in sentences]
# Expected: filtered_list = ["how now brown cow", "die hard fan of ", " kill", "besides I believe are the best", "who let out", " has got to go"]
My current implementation takes several hours. I don't care about readability as this code won't be used for long.
Any suggestions on how I can increase the run time?
My previous solution was overkill.
All I really had to do was use the word boundary \b as described in the documentation.
Usage example: https://regex101.com/r/2CZ8el/1
import re
names_joined = "|".join(names_lowercase)
names_whole_words_filter_expression = re.compile(rf"\b({names_joined})\b", flags=re.IGNORECASE)
def strip_names(text: str):
return re.sub(names_whole_words_filter_expression, "", text).strip()
Now it takes a few minutes instead of a few hours 🙌
I have written a piece of code to parse the action items from a troubleshooting doc.
I want to extract phrases that start with a verb and end with a noun.
It was working as expected earlier (a month ago). But on running against the same input as earlier, its missing some action items that it was catching previously.
I haven't changed the code. Has something changed from nltk or punkt side that may be affecting my results?
Please help me figure what needs to be changed to make it run as earlier.
import re
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
#One time downloads
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
custom_sent_tokenizer = PunktSentenceTokenizer()
def process_content(x):
try:
#sent_tag = []
act_item = []
for i in x:
print('tokenized = ',i)
words = nltk.word_tokenize(i)
print(words)
tagged = nltk.pos_tag(words)
print('tagged = ',tagged)
#sent_tag.append(tagged)
#print('sent= ',sent_tag)
#chunking
chunkGram = r"""ActionItems: {<VB.>+<JJ.|CD|VB.|,|CC|NN.|IN|DT>*<NN|NN.>+}"""
chunkParser = nltk.RegexpParser(chunkGram)
chunked = chunkParser.parse(tagged)
print(chunked)
for subtree in chunked.subtrees(filter=lambda t: t.label() == 'ActionItems'):
print('Filtered chunks= ',subtree)
ActionItems = ' '.join([w for w, t in subtree.leaves()])
act_item.append(ActionItems)
chunked.draw()
return act_item
except Exception as e:
#print(str(e))
return str(e)
res = 'replaced rev 6 aeb with a rev 7 aeb. configured new board and regained activity. tuned, flooded and calibrated camera. scanned fi rst patient with no issues. made new backups. replaced aeb board and completed setup. however, det 2 st ill not showing any counts. performed all necessary tests and the y passed . worked with tech support to try and resolve the issue. we decided to order another board due to lower rev received. camera is st ill down.'
tokenized = custom_sent_tokenizer.tokenize(res)
tag = process_content(tokenized)
With the input as shared in the code, earlier, the following action items were being parsed:
['replaced rev 6 aeb', 'configured new board', 'regained activity', 'tuned , flooded and calibrated camera', 'scanned fi rst patient', 'made new backups', 'replaced aeb board', 'completed setup', 'det 2 st ill', 'showing any counts', 'performed all necessary tests and the y', 'worked with tech support']
But now, only these are coming up:
['regained activity', 'tuned , flooded and calibrated camera', 'completed setup', 'det 2 st ill', 'showing any counts']
I finally resolved this by replacing JJ. with JJ|JJR|JJS
So my chunk is defined as :
chunkGram = r"""ActionItems: {<VB.>+<JJ|JJR|JJS|CD|NN.|CC|IN|VB.|,|DT>*<NN|NN.>+}"""
I dont understand this change in behavior.
Dot (.) was a really good way of using all modifiers on a POS
I'm working on a text-mining use case in python. These are the sentences of interest:
As a result may continue to be adversely impacted, by fluctuations in foreign currency exchange rates. Certain events such as the threat of additional tariffs on imported consumer goods from China, have increased. Stores are primarily located in shopping malls and other shopping centers.
How can I extract the sentence with the keyword "China"? I do need a sentence before and after that, actually atleast two sentences before and after.
I've tried the below, as was answered here:
import nltk
from nltk.tokenize import word_tokenize
sents = nltk.sent_tokenize(text)
my_sentences = [sent for sent in sents if 'China' in word_tokenize(sent)]
Please help!
TL;DR
Use sent_tokenize, keep track of the index where the focus word and window the sentences to get the desired result.
from itertools import chain
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
word_detokenize = TreebankWordDetokenizer().detokenize
text = """As a result may continue to be adversely impacted, by fluctuations in foreign currency exchange rates. Certain events such as the threat of additional tariffs on imported consumer goods from China, have increased global economic and political uncertainty and caused volatility in foreign currency exchange rates. Stores are primarily located in shopping malls and other shopping centers, certain of which have been experiencing declines in customer traffic."""
tokenized_text = [word_tokenize(sent) for sent in sent_tokenize(text)]
sent_idx_with_china = [idx for idx, sent in enumerate(tokenized_text)
if 'China' in sent or 'china' in sent]
window = 2 # If you want 2 sentences before and after.
for idx in sent_idx_with_china:
start = max(idx - window, 0)
end = min(idx+window, len(tokenized_text))
result = ' '.join(word_detokenize(sent) for sent in tokenized_text[start:end])
print(result)
Another example, pip install wikipedia first:
from itertools import chain
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
word_detokenize = TreebankWordDetokenizer().detokenize
import wikipedia
text = wikipedia.page("Winnie The Pooh").content
tokenized_text = [word_tokenize(sent) for sent in sent_tokenize(text)]
sent_idx_with_china = [idx for idx, sent in enumerate(tokenized_text)
if 'China' in sent or 'china' in sent]
window = 2 # If you want 2 sentences before and after.
for idx in sent_idx_with_china:
start = max(idx - window, 0)
end = min(idx+window, len(tokenized_text))
result = ' '.join(word_detokenize(sent) for sent in tokenized_text[start:end])
print(result)
print()
[out]:
Ashdown Forest in England where the Pooh stories are set is a popular
tourist attraction, and includes the wooden Pooh Bridge where Pooh and
Piglet invented Poohsticks. The Oxford University Winnie the Pooh
Society was founded by undergraduates in 1982. == Censorship in China
== In the People's Republic of China, images of Pooh were censored in mid-2017 from social media websites, when internet memes comparing
Chinese president Xi Jinping to Pooh became popular. The 2018 film
Christopher Robin was also denied a Chinese release.
I have been collecting tweets from the past week to collect the past-7-days tweets related to "lung cancer", yesterday, I figured I needed to start collecting more fields, so I added some fields and started re-collecting the same period of Tweets related to "lung cancer" from last week. The problem is, the first time I've collected ~2000 tweets related to lung cancer on 18th, Sept 2014. But last night, it only gave ~300 tweets, when I looked at the time of the tweets for this new set, it's only collecting tweets from something like ~23:29 to 23:59 on 18th Sept 2014. A large chunk of data is obviously missing. I don't think it's something with my code (below), I have tested various ways including deleting most of the fields to be collected and the time of data is still cut off prematurely.
Is this a known issue with Twitter API (when collecting last 7 days' data)? If so, it will be pretty horrible if someone is trying to do serious research. Or is it somewhere in my code that caused this (note: it runs perfectly fine for other previous/subsequent dates)?
import tweepy
import time
import csv
ckey = ""
csecret = ""
atoken = ""
asecret = ""
OAUTH_KEYS = {'consumer_key':ckey, 'consumer_secret':csecret,
'access_token_key':atoken, 'access_token_secret':asecret}
auth = tweepy.OAuthHandler(OAUTH_KEYS['consumer_key'], OAUTH_KEYS['consumer_secret'])
api = tweepy.API(auth)
# Stream the first "xxx" tweets related to "car", then filter out the ones without geo-enabled
# Reference of search (q) operator: https://dev.twitter.com/rest/public/search
# Common parameters: Changeable only here
startSince = '2014-09-18'
endUntil = '2014-09-20'
suffix = '_18SEP2014.csv'
############################
### Lung cancer starts #####
searchTerms2 = '"lung cancer" OR "lung cancers" OR "lungcancer" OR "lungcancers" OR \
"lung tumor" OR "lungtumor" OR "lung tumors" OR "lungtumors" OR "lung neoplasm"'
# Items from 0 to 500,000 (which *should* cover all tweets)
# Increase by 4,000 for each cycle (because 5000-6000 is over the Twitter rate limit)
# Then wait for 20 min before next request (becaues twitter request wait time is 15min)
counter2 = 0
for tweet in tweepy.Cursor(api.search, q=searchTerms2,
since=startSince, until=endUntil).items(999999999): # changeable here
try:
'''
print "Name:", tweet.author.name.encode('utf8')
print "Screen-name:", tweet.author.screen_name.encode('utf8')
print "Tweet created:", tweet.created_at'''
placeHolder = []
placeHolder.append(tweet.author.name.encode('utf8'))
placeHolder.append(tweet.author.screen_name.encode('utf8'))
placeHolder.append(tweet.created_at)
prefix = 'TweetData_lungCancer'
wholeFileName = prefix + suffix
with open(wholeFileName, "ab") as f: # changeable here
writeFile = csv.writer(f)
writeFile.writerow(placeHolder)
counter2 += 1
if counter2 == 4000:
time.sleep(60*20) # wait for 20 min everytime 4,000 tweets are extracted
counter2 = 0
continue
except tweepy.TweepError:
time.sleep(60*20)
continue
except IOError:
time.sleep(60*2.5)
continue
except StopIteration:
break
Update:
I have since tried running the same python scripts on a different computer (which is faster and more powerful than my home laptop). And the latter resulted in the expected number of tweets, I don't know why it's happening as my home laptop works fine for many programs, but I think we could rest the case and rule out the potential issues related to the scripts or Twitter API.
If you want to collect more data, I would highly recommend the streaming api that Tweepy has to offer. It has a much higher rate limit, in fact I was able to collect 500,000 tweets in just one day.
Also your rate limit checking is not very robust, you don't know for sure that Twitter will allow you to access 4000 tweets. From experience, I found that the more often you hit the rate limit the fewer tweets you are allowed and the longer you have to wait.
I would recommend using:
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
so that your application will not exceed the rate limit, alternatively you should check what you have used with:
print (api.rate_limit_status())
and then you can just sleep the thread like you have done.
Also your end date is incorrect. The end date should be '2014-09-21', one higher than whatever todays date is.
I am having quite a bit of technical issues. My python script below usually works (when time is in yyyy-mm-dd' format. But during the extremely heavy tweet activities, for example more than 500,000 tweets collected a day, my computer runs out of memory and have to force stop the program.
I can work around by looking at the time of the last tweets in the stopped csv file, in this case it's at time 18:44:00. I have tried many time format (for example 'yyyy-mm-dd hh:mm:ss' format as below) but none actually works.
import tweepy
import time
import csv
ckey = ""
csecret = ""
atoken = ""
asecret = ""
OAUTH_KEYS = {'consumer_key':ckey, 'consumer_secret':csecret,
'access_token_key':atoken, 'access_token_secret':asecret}
auth = tweepy.OAuthHandler(OAUTH_KEYS['consumer_key'], OAUTH_KEYS['consumer_secret'])
api = tweepy.API(auth)
# Stream the first "xxx" tweets related to "car", then filter out the ones without geo-enabled
# Reference of search (q) operator: https://dev.twitter.com/rest/public/search
# Common parameters: Changeable only here
startSince = '2014-09-18 00:00:00'
endUntil = '2014-09-18 18:44:00'
suffix = '_18SEP2014.csv'
############################
### Lung cancer starts #####
searchTerms2 = '"lung cancer" OR "lung cancers" OR "lungcancer" OR "lungcancers" OR \
"lung tumor" OR "lungtumor" OR "lung tumors" OR "lungtumors" OR "lung neoplasm"'
# Items from 0 to 500,000 (which *should* cover all tweets)
# Increase by 4,000 for each cycle (because 5000-6000 is over the Twitter rate limit)
# Then wait for 20 min before next request (becaues twitter request wait time is 15min)
counter2 = 0
for tweet in tweepy.Cursor(api.search, q=searchTerms2,
since=startSince, until=endUntil).items(999999999): # changeable here
try:
'''
print "Name:", tweet.author.name.encode('utf8')
print "Screen-name:", tweet.author.screen_name.encode('utf8')
print "Tweet created:", tweet.created_at'''
placeHolder = []
placeHolder.append(tweet.author.name.encode('utf8'))
placeHolder.append(tweet.author.screen_name.encode('utf8'))
placeHolder.append(tweet.created_at)
prefix = 'TweetData_lungCancer'
wholeFileName = prefix + suffix
with open(wholeFileName, "ab") as f: # changeable here
writeFile = csv.writer(f)
writeFile.writerow(placeHolder)
counter2 += 1
if counter2 == 4000:
time.sleep(60*20) # wait for 20 min everytime 4,000 tweets are extracted
counter2 = 0
continue
except tweepy.TweepError:
time.sleep(60*20)
continue
except IOError:
time.sleep(60*2.5)
continue
except StopIteration:
break