Sentiment analysis using TextBlob for Dutch language - python-2.7

I want to do sentiment analysis on a list of tweets fetched for a particular keyword. The tweets coming in are mostly in Dutch, and TextBlob needs them translated to English before it can compute polarity and subjectivity. How can I translate the tweets to English? I basically need a FREE API to do the translation; I'm having trouble using the MS Bing Translator. I have tried the goslate, langdetect, translate, and translation libraries, but none of them worked. Here's the code that I am using:
#!/usr/bin/env python
import tweepy
import goslate
from langdetect import detect
from translation import baidu, google, youdao, iciba
from translate import Translator
import os
import time
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

t = time.time()

# karan's api keys
consumer_key = 'xxx'
consumer_secret = 'xxx'
access_key = 'xxx'
access_secret = 'xxx'

gs = goslate.Goslate()
translator = Translator(to_lang="en")

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

search_results = api.search(q="football", count=2, geocode="52.132633,5.2912659999999505,300km")

f = open('tweets_football.txt', 'wb')
for i in range(0, len(search_results)):
    try:
        print search_results[i].text
        print search_results[i].id
        print search_results[i].user.screen_name
        trans = search_results[i].text
        #print(gs.translate(trans, 'en'))
        print(translator.translate(trans))
        if search_results[i].text not in search_results:
            f.write(search_results[i].text)
            f.write("\n")
            print "Written to file!"
    except Exception as e:
        print str(e)
f.close()
print time.time() - t
Please point me in the right direction. If there's an easier method for this process, please suggest that as well. Thanks in advance.

You can try googletrans (importing Translator from both translate and googletrans would shadow one with the other, so only the googletrans import is used):
from googletrans import Translator

text = 'hallo allemaal'
entext = Translator().translate(text, src='nl', dest='en').text
print(entext)
Note that the Google endpoint limits the number of requests from a single IP, so check for that if you start seeing errors.
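If you do hit that limit, one common workaround (just a sketch, not part of googletrans itself; the helper name and delays are illustrative) is to pause and retry:
import time
from googletrans import Translator

def translate_with_retry(text, retries=3, delay=5):
    # hypothetical helper: back off and retry when the endpoint rejects a request
    translator = Translator()
    for attempt in range(retries):
        try:
            return translator.translate(text, src='nl', dest='en').text
        except Exception:
            time.sleep(delay * (attempt + 1))
    return None

print(translate_with_retry('hallo allemaal'))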

There is a Dutch package for TextBlob called textblob-nl. You can install it via pip install textblob-nl
Official sample:
from textblob import TextBlob
from textblob_nl import PatternTagger, PatternAnalyzer
text = u"De kat wil wel vis eten maar geen poot nat maken."
blob = TextBlob(text, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
blob.sentiment
>>> (-0.1, 0.4)
You can find more information at https://github.com/gvisniuc/textblob-nl
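Applied to the question's use case, a minimal sketch (the tweet list is hardcoded here; in practice it would come from the tweepy search) that skips the translation step entirely:
from textblob import TextBlob
from textblob_nl import PatternTagger, PatternAnalyzer

# hardcoded stand-in for the tweets fetched with tweepy
tweets = [u"De kat wil wel vis eten maar geen poot nat maken."]
for tweet in tweets:
    blob = TextBlob(tweet, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    polarity, subjectivity = blob.sentiment
    print(tweet, polarity, subjectivity)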

Related

Python 2.7 Selenium unable to extract data

I am trying to extract data but keep getting this error:
NoSuchElementException: Message: u'Unable to locate element: {"method":"xpath","selector":"//*[@id=\'searchpopbox\']"}' ; Stacktrace:
at FirefoxDriver.findElementInternal_ (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/driver_component.js:8444)
at FirefoxDriver.findElement (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/driver_component.js:8453)
at DelayedCommand.executeInternal_/h (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10456)
at DelayedCommand.executeInternal_ (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10461)
at DelayedCommand.execute/< (file:///tmp/tmpjVcHQR/extensions/fxdriver@googlecode.com/components/command_processor.js:10401)
My code is below; I am trying to get the list from the link:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
browser = webdriver.Firefox(profile)

url = 'https://www.bursamarketplace.com/index.php?tpl=th001_search_ajax'
browser.get(url)
time.sleep(15)
a = browser.find_element_by_xpath("//*[@id='searchpopbox']")
print a
I am seeking your help to get the right xpath for the url.
This gets all the listings for that table.
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
import time

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.bursamarketplace.com/index.php?tpl=th001_search_ajax")
time.sleep(15)
a = driver.find_element_by_xpath("//*[@id='searchpopbox']")
print(a.text)
Or, without webdriver_manager, pass the driver's absolute path explicitly (the same applies to Firefox):
.Chrome(executable_path='absolutepathofchromedriver.exe')
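A minimal sketch of that explicit-path variant (the paths are placeholders for wherever you saved the driver binaries):
from selenium import webdriver

# placeholders: point these at your downloaded driver binaries
driver = webdriver.Chrome(executable_path='C:/path/to/chromedriver.exe')
# or, for Firefox:
# driver = webdriver.Firefox(executable_path='C:/path/to/geckodriver.exe')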

Wolframalpha in python with GTTS

I am trying to make a Friday-like virtual assistant using this code:
import os
from gtts import gTTS
import time
import playsound
import speech_recognition as sr

while True:
    def speak(text):
        tts = gTTS(text=text, lang="en")
        filename = "voice.mp3"
        tts.save(filename)
        playsound.playsound(filename)

    def get_audio():
        r = sr.Recognizer()
        with sr.Microphone() as source:
            audio = r.listen(source)
            said = ""
            try:
                said = r.recognize_google(audio)
                print(said)
            except Exception as e:
                print("Exception: " + str(e))
        return said

    text = get_audio()
    if "who are you" in text:
        speak(" I am Monday the virtual assistant")
I was wondering how to add Wolfram Alpha to it, so that I could say "search for ..." and it would speak the answer from Wolfram Alpha.
Any help would be amazing :)
Install wolframalpha
Then add the following to your code:
import wolframalpha

if 'search for ' in text:
    text = text.replace("search for ", "")
    client = wolframalpha.Client(app_id)
    res = client.query(text)
    print(next(res.results).text)
    speak(next(res.results).text)
To use the API, you have to go to the homepage, sign up for an account, create an app and get an app id.
To avoid getting any errors, keep the indentation in your 'speak' function uniform.
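Putting it together with the question's speak and get_audio functions, one loop iteration might look like the sketch below (app_id is a placeholder for your own key, and the StopIteration handling is an assumption about queries that return no result pods):
import wolframalpha

app_id = 'YOUR-APP-ID'  # placeholder: create an app on the Wolfram|Alpha developer portal

text = get_audio()
if 'search for ' in text:
    query = text.replace("search for ", "")
    client = wolframalpha.Client(app_id)
    res = client.query(query)
    try:
        answer = next(res.results).text
        speak(answer)
    except StopIteration:
        # no result pods came back for this query
        speak("I could not find an answer for that")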

How can I add my web scrape process using bs4 to selenium automation in Python to make it one single process which just asks for a zipcode?

I am using Selenium to go to a website, click the search button, and type a zip code that I enter beforehand. For that zip code, I want to take the link the page lands on and feed it to my web scraper built with Beautiful Soup, so that once the link comes up I can scrape the required data and produce my CSV.
What I want:
I am having trouble passing that link to the Beautiful Soup scraper as its URL. I basically want to automate it so that I just have to enter a zip code and it gives me my CSV.
What I am able to get:
I am able to enter the zip code and search using Selenium, and separately to point my scraper at a hardcoded URL to produce the CSV.
Code I am using for Selenium:
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

driver = webdriver.Chrome('/Users/akashgupta/Desktop/Courses and Learning/Automating Python and scraping/chromedriver')
driver.get('https://www.weather.gov/')

messageField = driver.find_element_by_xpath('//*[@id="inputstring"]')
messageField.click()
messageField.send_keys('75252')
time.sleep(3)
showMessageButton = driver.find_element_by_xpath('//*[@id="btnSearch"]')
showMessageButton.click()

# web scraping part:
url = "https://forecast.weather.gov/MapClick.php?lat=32.99802500000004&lon=-96.79775499999994#.Xo5LnFNKgWo"
res = requests.get(url)
soup = BeautifulSoup(res.content, 'html.parser')
tag = soup.find_all('div', id='seven-day-forecast-body')
weekly = soup.find_all(class_='tombstone-container')
main = soup.find_all(class_='period-name')
description = soup.find_all(class_='short-desc')
temp = soup.find_all(class_='temp')

Period_Name = []
Desc = []
Temp = []
for a in range(0, len(main)):
    Period_Name.append(main[a].get_text())
    Desc.append(description[a].get_text())
    Temp.append(temp[a].get_text())

df = pd.DataFrame(list(zip(Period_Name, Desc, Temp)), columns=['Period_Name', 'Short_Desc', 'Temperature'])
from selenium import webdriver
import time
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome('chromedriver.exe')
driver.get('https://www.weather.gov/')

messageField = driver.find_element_by_xpath('//*[@id="inputstring"]')
messageField.click()
messageField.send_keys('75252')
time.sleep(3)
showMessageButton = driver.find_element_by_xpath('//*[@id="btnSearch"]')
showMessageButton.click()

# wait until the URL matches the forecast-page pattern
WebDriverWait(driver, 10).until(EC.url_contains("https://forecast.weather.gov/MapClick.php"))
currentURL = driver.current_url
print(currentURL)
time.sleep(3)
driver.quit()

# web scraping part:
res = requests.get(currentURL)
....
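To finish the automation, the scraping code from the question can be wrapped in a function and pointed at currentURL. A sketch (scrape_forecast is a hypothetical name and the output filename is just an example):
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_forecast(url):
    # hypothetical wrapper around the question's BeautifulSoup code
    res = requests.get(url)
    soup = BeautifulSoup(res.content, 'html.parser')
    main = soup.find_all(class_='period-name')
    description = soup.find_all(class_='short-desc')
    temp = soup.find_all(class_='temp')
    rows = [(m.get_text(), d.get_text(), t.get_text())
            for m, d, t in zip(main, description, temp)]
    return pd.DataFrame(rows, columns=['Period_Name', 'Short_Desc', 'Temperature'])

df = scrape_forecast(currentURL)  # currentURL comes from the Selenium step above
df.to_csv('forecast_75252.csv', index=False)  # example filename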

Format string to XML file

I want to reformat a string into an XML structure, but my string is not in XML format (using Python 2.7).
I believe the correct way is to first build the XML as a single line and then use XML pretty-printing to turn it into a multi-line, indented XML file (see
Pretty printing XML in Python).
Below is an example of the input returned by a History Server REST API call to Hadoop server 1.
Input:
'{"jobAttempts":{"jobAttempt":[{"nodeHttpAddress":"slave2:8042","nodeId":"slave2:39637","id":1,"startTime":1544691730439,"containerId":"container_1544631848492_0013_01_000001","logsLink":"http://23.22.43.90:19888/jobhistory/logs/slave2:39637/container_1544631848492_0013_01_000001/job_1544631848492_0013/hadoop2"}]}}'
Output:
'<jobAttempts><jobAttempt><nodeHttpAddress>slave2:8042</nodeHttpAddress><nodeId>slave2:39637</nodeId><id>1</id><startTime>1544691730439</startTime><containerId>container_1544631848492_0013_01_000001</containerId><logsLink>http://23.22.43.90:19888/jobhistory/logs/slave2:39637/container_1544631848492_0013_01_000001/job_1544631848492_0013/hadoop2</logsLink></jobAttempt></jobAttempts>'
Final Output
<jobAttempts>
  <jobAttempt>
    <nodeHttpAddress>slave2:8042</nodeHttpAddress>
    <nodeId>slave2:39637</nodeId>
    <id>1</id>
    <startTime>1544691730439</startTime>
    <containerId>container_1544631848492_0013_01_000001</containerId>
    <logsLink>http://23.22.43.90:19888/jobhistory/logs/slave2:39637/container_1544631848492_0013_01_000001/job_1544631848492_0013/hadoop2</logsLink>
  </jobAttempt>
</jobAttempts>
*This string is actually an XML file which does not appear to have any style information associated with it.
I have found out that the source view of the History Server REST API response is indeed a one-line XML document. Thus, I had to read the source view with Python, not the problematic rendered view.
Before, I used
import urllib2
contents = urllib2.urlopen("http://23.22.43.90:19888/ws/v1/history/mapreduce/jobs/job_1544631848492_0013/jobattempts").read()
Now, I download the source view of the page with Selenium and BeautifulSoup and save it locally.
from bs4 import BeautifulSoup
from selenium import webdriver
import xml.dom.minidom

driver = webdriver.Firefox()
driver.get("http://23.22.43.90:19888/ws/v1/history/mapreduce/jobs/job_1544631848492_0013/jobattempts")
page_source = driver.page_source
driver.close()

soup = BeautifulSoup(page_source, "html.parser")
print(soup)

dom = xml.dom.minidom.parseString(str(soup))
pretty_xml_as_string = dom.toprettyxml()
with open("./content_new_2.xml", "w") as f:
    f.write(pretty_xml_as_string)
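Since the string returned by the REST call is actually JSON, an alternative sketch (no Selenium needed; build is a hypothetical helper written for this shape of data) parses it with the standard library and emits the XML directly:
import json
import xml.etree.ElementTree as ET
from xml.dom import minidom

def build(parent, key, value):
    # map JSON onto XML: dicts nest, lists repeat the tag, scalars become text
    if isinstance(value, dict):
        node = ET.SubElement(parent, key)
        for k, v in value.items():
            build(node, k, v)
    elif isinstance(value, list):
        for item in value:
            build(parent, key, item)
    else:
        node = ET.SubElement(parent, key)
        node.text = str(value)

data = json.loads(contents)  # contents: the JSON string from the urllib2 call above
root_key = next(iter(data))  # the single top-level key, here "jobAttempts"
root = ET.Element(root_key)
for k, v in data[root_key].items():
    build(root, k, v)

pretty = minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")
with open("./content_new_2.xml", "w") as f:
    f.write(pretty)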

PDF to Word Doc in Python

I've read through the other Stack Overflow questions regarding this, but they don't answer my issue, so down-vote away. It's version 2.7.
All I want to do is use Python to convert a PDF to a Word doc, or at minimum to plain text so I can copy and paste it into a Word doc.
This is the code I have so far. All it prints is the female gender symbol.
Is my code wrong? Am I approaching this wrong? Do some PDFs just not work with PDFMiner? Do you know of any other alternatives to accomplish my goal of converting a PDF to Word, besides using PyPDF2 or PDFMiner?
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file('Bottom Dec.pdf', 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    return text

print convert_pdf_to_txt(1)
You can use the pdf2docx package:
from pdf2docx import Converter

pdf_file = r'E:\Muhammad UMER LAR.pdf'
doc_file = r'E:\Lari.docx'

c = Converter(pdf_file)
c.convert(doc_file)
c.close()
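If you only need part of the document, Converter.convert also accepts a page range. A quick sketch (filenames are examples, and note that pdf2docx targets Python 3):
from pdf2docx import Converter

c = Converter(r'E:\sample.pdf')               # example input path
c.convert(r'E:\sample.docx', start=0, end=2)  # convert only the first two pages
c.close()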
Another alternative is the Aspose.Words Cloud SDK for Python, which you can install from pip, for PDF to DOC conversion.
import asposewordscloud
import asposewordscloud.models.requests

api_client = asposewordscloud.ApiClient()
api_client.configuration.host = 'https://api.aspose.cloud'
# Get AppKey and AppSID from https://dashboard.aspose.cloud/
api_client.configuration.api_key['api_key'] = 'xxxxxxxxxxxxxxxxxxxxx'  # put your appKey here
api_client.configuration.api_key['app_sid'] = 'xxxxxxxxx-xxxx-xxxxx-xxxx-xxxxxxxxxx'  # put your appSid here
words_api = asposewordscloud.WordsApi(api_client)

filename = '02_pages.pdf'
remote_name = 'TestPostDocumentSaveAs.pdf'
dest_name = 'TestPostDocumentSaveAs.doc'

# upload the PDF file to storage
request_storage = asposewordscloud.models.requests.UploadFileRequest(filename, remote_name)
response = words_api.upload_file(request_storage)

# convert PDF to DOC and save to storage
save_options = asposewordscloud.SaveOptionsData(save_format='doc', file_name=dest_name)
request = asposewordscloud.models.requests.SaveAsRequest(remote_name, save_options)
result = words_api.save_as(request)
print("Result {}".format(result))
I'm a developer evangelist at Aspose.