Platform: Python 2.7.13 on Windows 7 with the Spyder IDE.
I'm totally new to both BeautifulSoup and Python, so please bear with me. I am stuck at the last two lines.
Q. I want to import the details from the URL below and put them in a table, that is, the information in the dd tags.
The first part of the code works well to get the links and all of the school details. However, I'm having trouble running the for loop to get the remaining elements.
The full code is below:
# coding: utf-8
import urllib2
url = "http://tools.canlearn.ca/cslgs-scpse/cln-cln/rep-fit/p/af.p.clres.do?institution_id=default&searchType=ALL&searchString=&progLang=A&instType=B&prov_1=prov_1&progNameOnly=N&start=0&finish=999&section=1"
#try:
page = urllib2.urlopen(url)
#except (httplib.HTTPException, httplib.IncompleteRead, urllib2.URLError):
# missing.put(tmpurl)
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
rooturl = "http://tools.canlearn.ca/cslgs-scpse/cln-cln/rep-fit/p/"
from bs4 import BeautifulSoup
soup = BeautifulSoup(page)
info = soup.find_all("div", class_="wb-frm")
names = [x.ol.find_all("li") for x in info][0]
def f(string):
    return str(string[0] + ', ' + string[-1])
names2 = [names[i:i+3] for i in range(0, len(names), 3)]
diploma = [ [x[0].findAll("a")[0].find(text=True).strip() ,x[1].string ,f(x[2].find(text=True).strip().split()) ] for x in names2]
links = [x.ol.find_all("a") for x in info][0]
links2 = [y.get('href') for y in links]
links3 = [rooturl + z for z in links2]
for i in xrange(len(links3)):
    url_link = urllib2.urlopen(links3[i])
    link_html = BeautifulSoup(url_link)
    # Changed the code here based on good answer given by heyiamt.
    # It was:
    #     link_html2 = link_html.find_all("div", class_="wb-frm")
    #     website = link_html2[0].a.get('href')
    #     dd[y] = link2[y].get('dd')
    #     diploma[i].append(dd)
    #     diploma[i].append(link_html2[0].a.get('href'))
    #     diploma[i].append(website)
    #     # Get the whole box for the general info
    #     general_info_html = link_html.find_all("div", class_="panel-body")
    #     general_info_html2 = [y.findAll('dd') for y in general_info_html[2:]]
    #     general_info = {}
    #     for x in general_info_html2:
    #         general_info.update({x[0].find(text='dt'): x[1].find(text='dd')})
    #         general_info.update({x[0].get('dd')})
    #     diploma[i].append(general_info)
    for d in link_html.find_all('dd'):
        if d.a is not None:
            diploma[i].append(d.a.string)
            continue
        if d.string is not None:
            diploma[i].append(d.string)
            continue
        diploma[i].append(d.contents[0])
import pandas as pd
col1 = [x[1] for x in diploma]
col2 = [x[0] for x in diploma]
col3 = [x[2] for x in diploma]
col4 = [x[3] for x in diploma]
col5 = [x[4] for x in diploma]
col55 = {'Program Level': [x.get('Program Level:') for x in col5],
         'Credential Type': [x.get('Credential Type:') for x in col5],
         'Joint Program Level': [x.get('Joint Program Level:') for x in col5],
         'Joint Credential Type': [x.get('Joint Credential Type:') for x in col5],
         'Address': [x.get('Address:') for x in col5],
         'Telephone': [x.get('Telephone:') for x in col5],
         'Email': [x.get('Email:') for x in col5],
         'Fax': [x.get('Fax:') for x in col5],
         'Toll Free': [x.get('Toll Free:') for x in col5]}
df = pd.DataFrame(col1, columns = ['University'])
df2 = pd.DataFrame(col55)
df['Type'] = col2
df['City'] = col3
df['Website'] = col4
df['Address'] = df2['Address']
df['Credential Type'] = df2['Credential Type']
df['Email'] = df2['Email']
df['Fax'] = df2['Fax']
df['Joint Credential Type'] = df2['Joint Credential Type']
df['Joint Program Level'] = df2['Joint Program Level']
df['Program Level'] = df2['Program Level']
df['Telephone'] = df2['Telephone']
df['Toll Free'] = df2['Toll Free']
df.to_csv('data1.csv', encoding='utf-8')
Expected result (i.e., with the dd tags):
http://www.rosewoodcollege.ca/program-information/
Apprenticeship Program Certificate
Not entered
Not entered
Calgary, Alberta T3J 5H3
(403) 798-7447
mail@rosewoodcollege.ca
For this site, you can just use BeautifulSoup to find the tags within the divs without actually scrolling through the divs themselves. These particular dd tags have a bit of fishiness to them, though. Here's a shot at managing the different possibilities.
# Using link_html from your code above.
dd_strs = []
for d in link_html.find_all('dd'):
    if d.a is not None:
        dd_strs.append(d.a.string)
        continue
    if d.string is not None:
        dd_strs.append(d.string)
        continue
    dd_strs.append(d.contents[0])

for dd_str in dd_strs:
    print dd_str
Output is
http://www.rosewoodcollege.ca/program-information/
Apprenticeship Program
Certificate
Not entered
Not entered
Rosewood College
(403) 798-7447
mail@rosewoodcollege.ca
2015-12-30
If you can rely on the dt tags to always be mated, in order, to the dd tags, you can just repeat the above but for dt instead of dd and merge the lists accordingly.
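For example, a minimal sketch of that merge, assuming each dt really does line up one-to-one with the dd values collected above:
# Collect the dt labels the same way, then pair them with the dd values.
dt_strs = [dt.get_text(strip=True) for dt in link_html.find_all('dt')]
general_info = dict(zip(dt_strs, dd_strs))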
I have created this code for my research, but I want to use it for many data files, and I do not want to do it manually, which means retyping some lines in my code for each desired file. How can I use the input command in Python (I work with Python 2.7 on Windows) to make this faster, so that I can just type the name of the desired data file? My code so far:
import iodata as io
import matplotlib.pyplot as plt
import numpy as np
import time
from scipy.signal import welch
from scipy import signal
testInstance = io.InputConverter()
start = time.time()
conversionError = io.ConversionError()
#data = testInstance.convert(r"S:\Doktorat\Python\", 1", conversionError)
data = testInstance.convert(r"/Users/PycharmProjects/Hugo/20160401", "201604010000", conversionError)
end = time.time()
print("time elapsed " + str(end - start))
if(conversionError.conversionSucces):
    print("Conversion succesful")
if(conversionError.conversionSucces == False):
    print("Conversion failed: " + conversionError.conversionErrorLog)
print "Done!"
# Create a new subplot for two canals 1 & 3
a = np.amin(data.data)
Bx = data.data[0,]
By = data.data[1,]
dt = float(300)/266350
Fs = 1/dt
t = np.arange(0,300,dt*1e3)
N = len(Bx)
M = len(By)
time = np.linspace(0,300,N)
time2 = np.linspace(0,300,M)
filename = 'C:/Users/PycharmProjects/Hugo/20160401/201604010000.dat'
d = open(filename,'rb')
degree = u"\u00b0"
headersize = 64
header = d.read(headersize)
ax1 = plt.subplot(211)
ax1.set_title(header[:16] + ', ' +  # station name
              'Canals: ' + header[32:33] + ' and ' + header[34:35] + ', '  # canals
              + 'Temp' + header[38:43] + degree + 'C'  # temperature
              + ', ' + 'Time:' + header[26:32] + ', ' + 'Date' + ' ' + header[16:26])  # date
plt.ylabel('Pico Tesla [pT]')
plt.xlabel('Time [ms]')
plt.grid()
plt.plot(time[51:-14], Bx[51:-14], label='Canal 1', color='r', linewidth=0.1, linestyle="-")
plt.plot(time2[1:-14], By[1:-14], label='Canal 3', color='b', linewidth=0.1, linestyle="-")
plt.legend(loc='upper right', frameon=False, )
# Create a new subplot for FFT
plt.subplot(212)
plt.title('Fast Fourier Transform')
plt.ylabel('Power [a.u.]')
plt.xlabel('Frequency Hz')
xaxis2 = np.arange(0,470,10)
plt.xticks(xaxis2)
fft1 = (Bx[51:-14])
fft2 = (By[1:-14])
plt.grid()
# Loop for FFT data
for dataset in [fft1]:
    dataset = np.asarray(dataset)
    freqs, psd = welch(dataset, fs=266336/300, window='hamming', nperseg=8192)
    plt.semilogy(freqs, psd/dataset.size**0, color='r')
for dataset2 in [fft2]:
    dataset2 = np.asarray(dataset2)
    freqs2, psd2 = welch(dataset2, fs=266336/300, window='hamming', nperseg=8192)
    plt.semilogy(freqs2, psd2/dataset2.size**0, color='b')
plt.show()
As you can see, there are some places where it would be better to take input, so that when I run the code I can just type the file names into Python instead of creating a separate Python file with the information hard-coded.
By the way, I use PyCharm for my Python work.
If all you are trying to do is get rid of the hardcoded pathname, you should be able to format your name string with input variables
name = raw_input("Name: ")
measurement = raw_input("Measurement: ")
filename = "C:/Users/PycharmProjects/{0}/{1}".format(name, measurement)
see raw_input and string formatting
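For instance, a minimal sketch of wiring that into the code from the question (the prompts and paths are only placeholders based on the hard-coded values above):
folder = raw_input("Folder (e.g. 20160401): ")
measurement = raw_input("Measurement (e.g. 201604010000): ")

# Pass the formatted strings where the literals used to be.
data = testInstance.convert("/Users/PycharmProjects/Hugo/{0}".format(folder),
                            measurement, conversionError)
filename = "C:/Users/PycharmProjects/Hugo/{0}/{1}.dat".format(folder, measurement)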
I've been trying to make my first crawler and I've accomplished what I needed (get the 1st and 2nd shops' shipping info and prices), but with 2 crawlers instead of 1, because I've hit a big stopper here.
When there is more than 1 shop, the output result is:
In [1]: response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()').extract()
Out[1]:
[u'ENV\xcdO 3,95\u20ac ',
u'ENV\xcdO GRATIS',
u'ENV\xcdO GRATIS',
u'ENV\xcdO 4,95\u20ac ']
To get only the second result I'm using:
In [2]: response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')[1].extract()
Out[2]: u'ENV\xcdO GRATIS'
But when there's no second result (only 1 shop), I'm getting:
IndexError: list index out of range
And the crawler skips the full page, even if the other items have data...
After trying several times I decided on a quick solution to get the results: 2 crawlers, one for the first shops and the other for the second ones. But now I want to do it cleanly with only 1 crawler.
Any help, tip, or advice would be appreciated. This is my first try at making a recursive crawler with Scrapy, and I kind of like it.
Here's the code:
# -*- coding: utf-8 -*-
import scrapy
from Guapalia.items import GuapaliaItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class GuapaliaSpider(CrawlSpider):
    name = "guapalia"
    allowed_domains = ["guapalia.com"]
    start_urls = (
        'https://www.guapalia.com/perfumes?page=1',
        'https://www.guapalia.com/maquillaje?page=1',
        'https://www.guapalia.com/cosmetica?page=1',
        'https://www.guapalia.com/linea-de-bano?page=1',
        'https://www.guapalia.com/parafarmacia?page=1',
        'https://www.guapalia.com/solares?page=1',
        'https://www.guapalia.com/regalos?page=1',
    )
    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@class='js-pager']/a[contains(text(),'Siguientes')]"), follow=True),
        Rule(LinkExtractor(restrict_xpaths="//div[@class='list-display__item list-display__item--product']/div/a[@class='col-xs-10 col-sm-10 col-md-12 clickOnProduct']"), callback='parse_articles', follow=True),
    )

    def parse_articles(self, response):
        item = GuapaliaItem()
        articles_urls = response.url
        articles_first_shop = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="retailer-logo autoimage-container"]/img/@title').extract()
        articles_first_shipping = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="shipping"]/p//text()').extract()
        articles_second_shop = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div/img/@title')[1].extract()
        articles_second_shipping = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')[1].extract()
        articles_name = response.xpath('//div[@id="ProductDetail"]/@data-description').extract()

        item['articles_urls'] = articles_urls
        item['articles_first_shop'] = articles_first_shop
        item['articles_first_shipping'] = articles_first_shipping
        item['articles_second_shop'] = articles_second_shop if articles_second_shop else 'N/A'
        item['articles_second_shipping'] = articles_second_shipping
        item['articles_name'] = articles_name
        yield item
Basic output of the crawler, with the right format, when there is more than 1 shop:
2017-09-21 09:53:11 [scrapy] DEBUG: Crawled (200) <GET https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355> (referer: https://www.guapalia.com/perfumes?page=1)
2017-09-21 09:53:11 [scrapy] DEBUG: Scraped from <200 https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355>
{'articles_first_shipping': [u'ENV\xcdO GRATIS'],
'articles_first_shop': [u'DOUGLAS'],
'articles_name': [u'ZEN edp vaporizador 100 ml'],
'articles_second_shipping': u'ENV\xcdO 3,99\u20ac ',
'articles_second_shop': u'BUYSVIP',
'articles_urls': 'https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355'}
The problem is when a second shop doesn't exist, because my code for the second-shop field raises:
IndexError: list index out of range
SOLUTION (thanks to @Tarun Lalwani):
def parse_articles(self, response):
    item = GuapaliaItem()
    articles_urls = response.url
    articles_first_shop = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="retailer-logo autoimage-container"]/img/@title').extract()
    articles_first_shipping = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="shipping"]/p//text()').extract()
    articles_second_shop = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div/img/@title')
    articles_second_shipping = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')
    articles_name = response.xpath('//div[@id="ProductDetail"]/@data-description').extract()

    if len(articles_second_shop) > 1:
        item['articles_second_shop'] = articles_second_shop[1].extract()
    else:
        item['articles_second_shop'] = 'Not Found'

    if len(articles_second_shipping) > 1:
        item['articles_second_shipping'] = articles_second_shipping[1].extract()
    else:
        item['articles_second_shipping'] = 'Not Found'

    item['articles_urls'] = articles_urls
    item['articles_first_shop'] = articles_first_shop
    item['articles_first_shipping'] = articles_first_shipping
    item['articles_name'] = articles_name
    yield item
You need to get the result into a variable first. Then you can make a decision based on its length:
texts = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')
if len(texts) > 1:
    data = texts[1].extract()
elif len(texts) == 1:
    data = texts[0].extract()
else:
    data = "Not found"
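If the same check is needed for several fields, it can be wrapped in a small helper (a minimal sketch; nth_text is just an illustrative name, not a Scrapy API):
def nth_text(selector_list, n, default='Not Found'):
    # Return the extracted n-th match, or the default when that match does not exist.
    if len(selector_list) > n:
        return selector_list[n].extract()
    return default

# Usage inside parse_articles:
# item['articles_second_shop'] = nth_text(articles_second_shop, 1)
# item['articles_second_shipping'] = nth_text(articles_second_shipping, 1)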
I am very new to Python, so please forgive the stupid questions. Thanks in advance.
I have the following data (float) printed out with bs4 and requests, with the code print link.find_all("id"), link.text:
X a
X b
X c
Y a
Y b
Y c
Z a
Z b
Z c
Instead, I would like to save it like:
X a b c
Y a b c
Z a b c
and then save it into a text file so that I can use it afterwards. (I don't even know how to save data to a file with Python.)
Welcome to Python! Here's a quick example of creating a dict of lists and writing it to a text file.
from bs4 import BeautifulSoup
# import collections
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
<a class="sister" href="http://example.com/tillie" id="link3">Tillie2</a>;
"""
soup = BeautifulSoup(html_doc, 'html.parser')
anchors = soup.find_all('a')
data = {} # collections.OrderedDict() if order matters
for item in anchors:
    key = item.get('id')
    if key not in data.keys():
        data.update({key: [item.text]})
    else:
        values = data[key]
        values.append(item.text)
        data.update({key: values})

with open('example.txt', 'w') as f:
    for key, value in data.items():
        line = key + ' ' + ' '.join(value) + '\n'
        f.write(line)
# example.txt
# link1 Elsie
# link3 Tillie Tillie2
# link2 Lacie
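As a variation on the same idea, collections.defaultdict(list) removes the need for the membership check (a minimal sketch, reusing the anchors parsed above):
from collections import defaultdict

data = defaultdict(list)
for item in anchors:
    data[item.get('id')].append(item.text)
# The writing loop above then works unchanged on data.items().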
I am trying to perform feature selection for a logistic regression classifier. Originally there are 4 variables: name, location, gender, and the label = ethnicity. The variables, mainly the name, give rise to tens of thousands of additional "features"; for example, the name "John Snow" gives rise to 2-letter substrings like 'jo', 'oh', 'hn', etc. The feature set then undergoes DictVectorization.
I am trying to follow this tutorial (http://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html), but I am not sure if I am doing it right, since the tutorial uses a small number of features while mine has tens of thousands after vectorization. Also, plt.show() shows a blank figure.
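For illustration, the 2-letter substring features I describe are generated roughly like this (a minimal sketch; feature_twoLetters is just an illustrative name, and my real generator is not in the code below):
def feature_twoLetters(nameString):
    # "john snow" -> ['jo', 'oh', 'hn', 'sn', 'no', 'ow']
    substrings = []
    for word in nameString.lower().split():
        for i in range(len(word) - 1):
            substrings.append(word[i:i + 2])
    return substrings
list_to_dict (defined in the code below) then turns such a list into {'substring=jo': True, ...} entries before DictVectorizer one-hot encodes them.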
# coding=utf-8
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import re
import random
import time
from random import randint
import csv
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
# Assign X and y variables
X = df.raw_name.values
X2 = df.name.values
X3 = df.gender.values
X4 = df.location.values
y = df.ethnicity_scan.values
# Feature extraction functions
def feature_full_name(nameString):
    try:
        full_name = nameString
        if len(full_name) > 1:  # not accept name with only 1 character
            return full_name
        else:
            return '?'
    except:
        return '?'

def feature_avg_wordLength(nameString):
    try:
        space = 0
        for i in nameString:
            if i == ' ':
                space += 1
        length = float(len(nameString) - space)
        name_entity = float(space + 1)
        avg = round(float(length / name_entity), 0)
        return avg
    except:
        return 0

def feature_name_entity(nameString2):
    space = 0
    try:
        for i in nameString2:
            if i == ' ':
                space += 1
        return space + 1
    except:
        return 0

def feature_gender(genString):
    try:
        gender = genString
        if len(gender) >= 1:
            return gender
        else:
            return '?'
    except:
        return '?'

def feature_noNeighborLoc(locString):
    try:
        x = re.sub(r'^[^, ]*', '', locString)  # remove everything before and include first ','
        y = x[2:]  # remove subsequent ',' and ' '
        return y
    except:
        return '?'

def list_to_dict(substring_list):
    try:
        substring_dict = {}
        for i in substring_list:
            substring_dict['substring=' + str(i)] = True
        return substring_dict
    except:
        return '?'
# Transform format of X variables, and spit out a numpy array for all features
my_dict13 = [{'name-entity': feature_name_entity(feature_full_name(i))} for i in X2]
my_dict14 = [{'avg-length': feature_avg_wordLength(feature_full_name(i))} for i in X]
my_dict15 = [{'gender': feature_full_name(i)} for i in X3]
my_dict16 = [{'location': feature_noNeighborLoc(feature_full_name(i))} for i in X4]
my_dict17 = [{'dummy1': 1} for i in X]
my_dict18 = [{'dummy2': random.randint(0,2)} for i in X]
all_dict = []
for i in range(0, len(my_dict13)):
    temp_dict = dict(my_dict13[i].items() + my_dict14[i].items()
                     + my_dict15[i].items() + my_dict16[i].items()
                     + my_dict17[i].items() + my_dict18[i].items())
    all_dict.append(temp_dict)
newX = dv.fit_transform(all_dict)
# Separate the training and testing data sets
half_cut = int(len(df)/2.0)*-1
X_train = newX[:half_cut]
X_test = newX[half_cut:]
y_train = y[:half_cut]
y_test = y[half_cut:]
# Fitting X and y into model, using training data
lr = LogisticRegression()
lr.fit(X_train, y_train)
dv = DictVectorizer()
# Feature selection
plt.figure(1)
plt.clf()
X_indices = np.arange(X_train.shape[-1])
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X_train, y_train)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='g')
plt.show()
Warning:
E:\Program Files Extra\Python27\lib\site-packages\sklearn\feature_selection\univariate_selection.py:111: UserWarning: Features [[0 0 0 ..., 0 0 0]] are constant.
It looks like the way you split your data into training and testing sets is not working:
# Separate the training and testing data sets
X_train = newX[:half_cut]
X_test = newX[half_cut:]
If you already use sklearn, it is much more convenient to use the builtin splitting routine for this:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5, random_state=0)
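For completeness, a minimal sketch with the import, using the vectorized matrix and labels built in the question (sklearn.cross_validation was the module name in releases contemporary with Python 2.7; newer versions expose the same function from sklearn.model_selection):
from sklearn import cross_validation

# newX and y are the DictVectorizer output and labels from the question.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    newX, y, test_size=0.5, random_state=0)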
I am new to Django and know little Python. I am learning to draw graphs in the Django framework. I drew single bar charts, but I have a problem drawing a multiple bar chart using the database telecom_db of my Django project. However, in wxPython the following code worked fine. Could you figure out if something is wrong with Django in the code below?
def graph(request):
    figName = "figGraph.png"
    path = "F:\MajorWorkspace\Visualisations\\" + figName
    if os.path.exists(path) == False:
        age_gr = []
        countm = []
        countf = []
        import MySQLdb
        db = MySQLdb.connect(host="localhost",
                             user="root",
                             passwd="",
                             db="telecom_db")
        cursor1 = db.cursor()
        cursor2 = db.cursor()
        cursor1.execute("select count(card_no) from demo where gender = 0 group by age_group")
        cursor2.execute("select count(card_no) from demo where gender = 1 group by age_group")
        numrows1 = int(cursor1.rowcount)
        # numrows2 = int(cursor2.rowcount)
        sum_male = 0
        sum_female = 0
        for x in range(numrows1):
            row1 = cursor1.fetchone()
            age_gr.append(x)
            countm.append(row1[0])
            sum_male += row1[0]
            row2 = cursor2.fetchone()
            countf.append(row2[0])
            sum_female += row2[0]
            # avg_call_group[x] = row[1]
        cursor1.close()
        cursor2.close()
        import numpy as np
        import matplotlib.pyplot as plt
        N = len(age_gr)
        ind = np.arange(N)  # the x locations for the groups
        width = 0.35        # the width of the bars
        fig = plt.figure()
        ax = fig.add_subplot(111)
        rects1 = ax.bar(ind, countf, width, color='b')
        rects2 = ax.bar(ind + width, countm, width, color='r')
        # add some
        ax.set_ylabel('Scores')
        ax.set_title('Age group and Gender-wise Subscriber Distribution')
        ax.set_xticks(ind + width)
        # \n0:under 16 \n 1:16-20 \n i(<-N):16+5i-20+5i (i<4) \n 5:35-40 \n 6:40-50 \n 7:50 over
        ax.set_xticklabels(('Under 16', '16-20', '21-25', '26-30', '31-35', '36-40', '40-50', 'Above 50'))
        ax.legend((rects1[0], rects2[0]), ('male', 'female'))

        def autolabel(rects, sex):
            # attach some text labels
            hf = 0
            hm = 0
            iter = 0
            for rect in rects:
                height = rect.get_height()
                if sex == 0:
                    hf += height
                    print 'Female'
                    print '\n Height=' + str(height) + '\n Sum_female=' + str(sum_female)
                    pf = (height * 1.00 / sum_female) * 100.00
                    print pf
                    ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, '%1.1f%%' % float(pf), ha='center', va='bottom')
                    iter += 1
                else:
                    hm += height
                    print 'Male'
                    print '\n Height=' + str(height) + '\n Sum_male=' + str(sum_male)
                    pm = (height * 1.00 / sum_male) * 100.00
                    print pm
                    ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * height, '%1.1f%%' % float(pm), ha='center', va='bottom')

        autolabel(rects1, 0)
        autolabel(rects2, 1)
        fig.savefig(path)
    image_data = open(path, "rb").read()
    return HttpResponse(image_data, mimetype="image/png")