Unicode and urllib.open - python-2.7

I am creating an application in Python that parses weather data from yr.no. It works fine with regular ASCII strings, but fails when I use Unicode.
def GetYRNOWeatherData(country, province, place):
#Parse the XML file
wtree = ET.parse(urllib.urlopen("http://www.yr.no/place/" + string.replace(country, ' ', '_').encode('utf-8') + "/" + string.replace(province, ' ', '_').encode('utf-8') + "/" + string.replace(place, ' ', '_').encode('utf-8') + "/forecast.xml"))
For example, when I try
GetYRNOWeatherData("France", "Île-de-France", "Paris")
I get this error
'charmap' codec can't encode character u'\xce' in position 0: character maps to <undefined>
Is it true that urllib doesn't handle unicode very well? Since I am using Tkinter as a frontend to this function, would that be the source of the problem (does the Tkinter Entry widget handle unicode well?)

You can handle this by keeping every string as a unicode right up until you actually make the urllib.urlopen request, at which point you encode to utf-8:
# -*- coding: utf-8 -*-
# This import makes all literal strings in the file default to
# type 'unicode' rather than type 'str'. You don't need to use this,
# but you'd need to do u"France" instead of just "France" below, and
# everywhere else you have a string literal.
from __future__ import unicode_literals
import urllib
import xml.etree.ElementTree as ET
def do_format(*args):
    """Return the arguments as a list with every space replaced by "_"."""
    return [text.replace(" ", "_") for text in args]
def GetYRNOWeatherData(country, province, place):
    """Fetch and parse the yr.no forecast XML for the given place.

    Spaces in each name are replaced with underscores by do_format. The URL
    stays a unicode string until the request is made, at which point it is
    encoded to UTF-8. Returns the tree produced by ET.parse.
    """
    segments = do_format(country, province, place)
    address = "http://www.yr.no/place/{}/{}/{}/forecast.xml".format(*segments)
    response = urllib.urlopen(address.encode('utf-8'))
    return ET.parse(response)
if __name__ == "__main__":
GetYRNOWeatherData("France", "Île-de-France", "Paris")


Extract zip file with swedish ÅÄÖ in the filenames in python2.7

When I extract my zip file containing a file with Å, Ä or Ö letters,
I get garbage characters.
I'm using Python 2.7.
with zipfile.ZipFile(temp_zip_path.decode('utf-8')) as f:
for fn in f.namelist():
extracted_path = f.extract(fn)
Zipfile assumes that the encoding of the filenames is CP437. If your zipfile encoding is not unicode, you need to decode file/directory names if they contain accented letters in order to see the non-garbaged name. But if you try to extract contents based on the decoded string, it won't be found, because zipfile will find stuff by the original (garbage or not) name.
You could rename the files one by one after extracting but that would be painful.
What you could do is something like this: read the contents and write them on the decoded name.
# -*- coding: utf-8 -*-
import zipfile
import os
temp_zip_path = r'd:\Python_projects\sandbox\cp_437.zip'
temp_zip_path2 = r'd:\Python_projects\sandbox\unicode.zip'
target_loc = os.path.dirname(os.path.realpath(__file__))
def unpack_cp437_or_unicode(archive_path, target_dir=None):
    """Extract a zip archive, writing each entry under its decoded name.

    Entries are looked up in the archive by the name zipfile reports, but
    are written to disk under the CP437-decoded name so that accented
    letters are not garbled.

    archive_path -- path of the zip file to extract.
    target_dir   -- directory to extract into; defaults to the module-level
                    target_loc.
    """
    if target_dir is None:
        target_dir = target_loc
    with zipfile.ZipFile(archive_path) as zz:
        for zipped_name in zz.namelist():
            # Byte-string names are decoded from CP437; names that are
            # already text are used unchanged.
            if isinstance(zipped_name, bytes):
                real_name = zipped_name.decode('cp437')
            else:
                real_name = zipped_name
            # NOTE(review): entry names are joined to target_dir unchecked;
            # validate them before extracting archives from untrusted sources.
            destination = os.path.join(target_dir, real_name)
            if zipped_name.endswith('/'):
                # Directory entry: create it, there is no content to write.
                if not os.path.isdir(destination):
                    os.makedirs(destination)
                continue
            # Archives do not always contain explicit directory entries.
            parent = os.path.dirname(destination)
            if parent and not os.path.isdir(parent):
                os.makedirs(parent)
            with zz.open(zipped_name) as archived:
                contents = archived.read()
            with open(destination, 'wb') as target:
                target.write(contents)

django for production of my recommender system

I have written a content-based recommender system in Python 3 using data from a MySQL database. Now I have to use Django for production so that I do not need to supply the input manually each time new articles are added to the database. How do I convert this Python code into a Django production setup? I will connect to the database through Django's database connections. I am really confused about how to write this code in Django.
import pandas as pd
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from string import punctuation
import functools
from matplotlib import pyplot as plt
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords =True)
from tqdm import tqdm_notebook
import numpy as np
import math
from sklearn.metrics.pairwise import linear_kernel
#import text
from collections import Counter
# Load the article table and normalise the text column used for recommendations.
df = pd.read_csv('target.csv')
df = df.loc[:, ['id', 'combined_text']].astype(str)
# Drop repeated words, keeping the first occurrence of each in order.
# dict.fromkeys is used because pd.unique on a plain list is deprecated.
df["combined_text"] = df["combined_text"].apply(lambda x: ' '.join(dict.fromkeys(x.split())))
df.combined_text = df.combined_text.apply(lambda x: x.lower())
# Every pattern below is a regular expression. regex=True is passed explicitly
# because pandas >= 2.0 treats the pattern as a literal string by default.
df.combined_text = df.combined_text.str.replace(r'[^\w\s]', ' ', regex=True)
df['combined_text'] = df['combined_text'].str.replace(r'\d+', ' ', regex=True)
df.combined_text = df.combined_text.str.replace(r'nbsp?', ' ', regex=True)
#df.combined_text = df.combined_text.str.replace('nan?' , ' ')
df.combined_text = df.combined_text.str.replace(r'value?', ' ', regex=True)
# NOTE(review): after astype(str) above, missing values are presumably the
# string 'nan', so this dropna may remove nothing -- confirm the intent.
df = df.dropna(subset=['combined_text'])
df.combined_text = df.combined_text.str.replace(r'\s+', ' ', regex=True)
#df.combined_text.map(len).hist(figsize=(15, 5), bins=100)
# Keep only articles whose cleaned text is longer than 600 characters.
df = df[(df.combined_text.map(len) > 600)]
df.reset_index(inplace=True, drop=True)
#df1 = df[(df.combined_text.map(len) > 7500)]
# Build the custom stop-word list: one word per line from stopwords.txt,
# followed by a hand-picked set of extra words.
# The with-block closes the file; the original left the handle open.
with open('stopwords.txt', 'r') as f:
    stop_words = [l.replace('\n', '') for l in f]
additional_stop_words = [
    't', 'aah', 'aap', 'don', 'doesn', 'isn', 've', 'll', 'add', 'ndash',
    'will', 'nan', 'q', 'article', 'lsquo', 'rsquo', 'ldquo', 'rdquo',
    'personalised', 'please', 'read', 'download', 'app', 'here', 'more',
    'experience', 'based', 'explore', 'bull', 'fact', 'myth', 'ndash',
    'middot', 'lifestage', 'entire', 'collection', 'articles', 'reading',
    'website', 'android', 'phone', 'a', 'zero',
]
stop_words += additional_stop_words
# Drop empty entries (blank lines in the file).
stop_words = list(filter(None, stop_words))
def _removeNonAscii(s):
return "".join(i for i in s if ord(i)<128)
def clean_text(text):
    """Lower-case text, expand common contractions and strip non-letters.

    The substitutions run in the order listed, because later patterns depend
    on what earlier ones leave behind. The result is stripped of leading and
    trailing whitespace.
    """
    text = re.sub(r"what's", "what is ", text.lower())
    text = text.replace('(ap)', '')
    substitutions = (
        (r"\'s", " is "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r'\W+', ' '),
        (r'\s+', ' '),
        (r"\\", ""),
        (r"\'", ""),
        (r"\"", ""),
        ('[^a-zA-Z ?!]+', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return _removeNonAscii(text).strip()
def tokenizer(text):
    """Clean text and return its word tokens, minus stop words and punctuation.

    The text is passed through clean_text, split into sentences and then
    into words. Tokens found in stop_words or in string.punctuation are
    dropped. Returns a flat list of tokens, which is empty when the cleaned
    text yields no sentences.
    """
    text = clean_text(text)
    # Build the exclusion set once per call rather than once per token.
    excluded = set(stop_words).union(punctuation)
    # A nested comprehension flattens the per-sentence token lists and copes
    # with zero sentences, where reduce() without an initial value raises.
    return [
        token
        for sent in sent_tokenize(text)
        for token in word_tokenize(sent)
        if token not in excluded
    ]
#df['combined_text'] = df['combined_text'].map(lambda d: str.encode(d.decode('utf-8')))
# Tokenize each article, stem the tokens, join the stems back into one string
# and tokenize that string again.
# NOTE(review): progress_map is presumably supplied by tqdm's pandas
# integration, which this excerpt never registers -- confirm it is set up.
df['tokens'] = ''
df['tokens'] = df['combined_text'].progress_map(lambda d: tokenizer(d))
df['text_stemmed']=df['tokens'].apply(lambda x : [stemmer.stem(y) for y in x])
df['text_stemmed_sentence']=df['text_stemmed'].apply(lambda x : " ".join(x))
df['stemmed_tokens'] = df['text_stemmed_sentence'].progress_map(lambda d: tokenizer(d))
# Keep only the id and the two stemmed representations.
df = df[['id','text_stemmed_sentence','stemmed_tokens']]
# =============================================================================
# for descripition, tokens in zip(df['combined_text'].head(5), df['tokens'].head(5)):
# print('description:', descripition)
# print('tokens:', tokens)
# print()
# =============================================================================
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF model over the stemmed token strings (unigrams and bigrams,
# ignoring terms that occur in fewer than 5 documents).
vectorizer = TfidfVectorizer(min_df=5, analyzer='word', ngram_range=(1, 2), stop_words='english')
vz = vectorizer.fit_transform(list(df['stemmed_tokens'].map(lambda tokens: ' '.join(tokens))))
# Pairwise dot products of the TF-IDF rows.
# NOTE(review): these equal cosine similarities only if the rows are
# L2-normalised -- confirm against the vectorizer settings.
cosine_similarities = linear_kernel(vz,vz)
# Square score table labelled by article id on both axes.
articlesRecommend = pd.DataFrame(cosine_similarities, columns = df.id, index = df.id)
# For every article, the ids of the 10 highest-scoring articles
# (presumably including the article itself -- verify before serving results).
y = np.array([articlesRecommend[c].nlargest(10).index.values for c in articlesRecommend])
articles_df = pd.DataFrame(data = y, index = articlesRecommend.columns)
The complete answer to this question will be lengthy but i can just wrap it up simply as:
First make virtualenv and install django. Also, you will need to install all the python packages that you used in your python program like pandas etc.
run this simple command django-admin startproject <project_name>. Next, run django-admin startapp <app_name> this is for making an app in the django project since django can have many apps.
Open source/source/settings.py and in your INSTALLED_APPS list, mention the name of your app.
You will need to put the same code in your app's views.py. There should be at least one function that takes a request argument and performs the same task.
Something like this:
import pandas # and import other libs
def some_func(request):
## your code
Next you will have to map this function with url in urls.py, that is something you can find here: mapping the urls to functions in views.py
Of course, you will have to run the server using python manage.py runserver; you'll then be able to reach your project at the address the command prints (http://127.0.0.1:8000/ by default).
Honestly, if you understand the basic architecture of Django, this is a very easy task. The Django documentation can help you.
Coming to the crux of your question:
You explained that you'll be suggesting the most related articles on the basis of an already existing article. First, your Laravel project should send the data in JSON format, and you can read that data in the functions in your views.py. Once you have read the data and run your already working code, you should be able to send the most related articles' information (such as their ids) back through some URL. For this you can either use Django REST framework or simply return a JsonResponse from your function.

How can i clean urdu data corpus Python without nltk

I have a corpus of more than 10,000 words in Urdu, and I want to clean it. Special Unicode characters such as "!؟ـ،" appear in my text, and whenever I use regular expressions I get an error saying that my data is not encoded.
Kindly provide me some help to clean my data.
Thank you
Here is my sample data:
I used your sample to find all words with ہ or ر
Notice that I had to tell python that I am dealing with utf-8 data by using u in front of the regex string as well as the data string
import re
data = u"""
result = re.findall(u'[^\s\n]+[ہر][^\s\n]+',data,re.MULTILINE)
The output was
['ظہیر', 'ماہرہ', 'تصاویر،', 'پہنچایا', '،ہوا']
Another example: this removes all non-word characters (punctuation and symbols), keeps whitespace, and makes sure only one space separates the words.
result = re.sub(' +',' ',re.sub(u'[\W\s]',' ',data))
the output is
ظہیر احمد ماہرہ خان کی تصاویر نے دائیں اور بائیں والوں کو آسمانوں پر پہنچایا ہوا ہے دائیں والے
You can also use a word tokenizer:
import nltk
result = nltk.tokenize.wordpunct_tokenize(data)
the output will be
['ظہیر', 'احمد', 'ماہرہ'
, 'خان', 'کی', '،', 'تصاویر'
, '،', 'نے', 'دائیں', 'اور', 'بائیں', 'والوں'
, 'کو', 'آسمانوں', 'پر', 'پہنچایا'
, '،', 'ہوا', 'ہے', '۔', 'دائیں', '؟', 'والے']
Edit ... for Python 2.7 you have to specify the encoding at the beginning of the code file as well as telling re that the regex is 'unicode' using re.UNICODE
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import re
data = u"""ظہیر
result = re.sub(ur'\s+',u' ',re.sub(ur'[\W\s]',ur' ',data,re.UNICODE),re.UNICODE)
also note the use of ur'' to specify the string is a unicode regex string

why cleaning text function doens't work without decoding to UTF8?

I wrote the following function in Python 2.7 to clean text, but it doesn't work unless I first decode the tweet variable from UTF-8.
# -*- coding: utf-8 -*-
import re
def clean_tweet(tweet):
    """Replace every character outside U+0622..U+064A with a single space."""
    outside_range = re.compile(u"[^\u0622-\u064A]", flags=re.U)
    return outside_range.sub(' ', tweet)
if __name__ == "__main__":
s="sadfas سيبس sdfgsdfg/dfgdfg ffeee منت منشس يت??بمنشس//تبي منشكسميكمنشسكيمنك ٌاإلا رًاٌااًٌَُ"
print "not working "+clean_tweet(s)
print "working "+clean_tweet(s.decode("utf-8"))
Could any one explain why?
I don't want to decode, because it makes manipulating the text in an SFrame in GraphLab too slow.

Selecting nodes with non-ASCII characters in Scrapy

I have the following simple web scraper written in Scrapy:
#!/usr/bin/env python
# -*- coding: latin-1 -*-
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class MySpiderTest(BaseSpider):
name = 'MySpiderTest'
allowed_domains = ["boliga.dk"]
start_urls = ["http://www.boliga.dk/bbrinfo/3B71489C-AEA0-44CA-A0B2-7BD909B35618",]
def parse(self, response):
hxs = HtmlXPathSelector(response)
item = bbrItem()
print hxs.select("id('unitControl')/div[2]/table/tbody/tr[td//text()[contains(.,'Antal Badeværelser')]]/td[2]/text()").extract()
but when I run the spider I get the following syntax error:
SyntaxError: Non-ASCII character '\xe6' in file... on line 32, but no encoding declared
because of the æ in the xpath. The xpath is working in Xpath Checker for Firefox. I tried URL-encoding the æ, but that didn't work. What am I missing?
UPDATE: I have added the encoding declaration in the beginning of the code (Latin-1 should support Danish characters)
Use a unicode string for your XPath expression
hxs.select(u"id('unitControl')/div[2]/table/tbody/tr[td//text()[contains(.,'Antal Badeværelser')]]/td[2]/text()").extract()
hxs.select(u"id('unitControl')/div[2]/table/tbody/tr[td//text()[contains(.,'Antal Badev\u00e6relser')]]/td[2]/text()").extract()
See Unicode Literals in Python Source Code
SyntaxError: Non-ASCII character ‘\xe2′ in file … on line 40,
but no encoding declared …
This is caused by standard characters such as the straight apostrophe (') being replaced with non-standard characters such as the curly quotation mark (‘) during copying.
Try to edit the text copied from pdf.
# Build the XPath with a unicode literal for the non-ASCII text.
response.xpath("//tr[contains(., '" + u'中文字符' + "')]").extract()