Cannot update table when comparing data between two cursors - python-2.7

I wanted to compare rows of a table to find out whether they are equal or not. What I did was create two cursors:
1. Select links from the table where visited = Yes
2. Select links from the table where visited = No
Using a for loop and an if statement I want to compare the visited links with the not-visited links to see whether they are equal, and if they are equal, update that link's visited column to "Yes".
Not done yet (my aim was to exit the program once all links are visited and marked "Yes", or once the cursor for "where visited = No" returns no rows).
My portion of the code:
import sys
import MySQLdb
import urllib
import urlparse
import re
import HTMLParser
from HTMLParser import HTMLParseError
from bs4 import BeautifulSoup

mydb = MySQLdb.connect(host='localhost',
                       user='root',
                       passwd='shailang',
                       db='mydb')
cursor = mydb.cursor()

def process2(url):
    flag = 0
    cursor.execute("SELECT links FROM DATA_urls where visited = 'Ye'")
    Yes_rows = cursor.fetchall()
    cursor.execute("SELECT links FROM DATA_urls where visited = 'No'")
    No_rows = cursor.fetchall()
    for No_links in No_rows:
        print 'NOOOOOOOOOO'
        k = No_links
        print k
        for Yes_links in Yes_rows:
            print "YESSSSSSSSSSSSSS"
            k1 = Yes_links
            print k1
            if k1 == k:
                print 'EQUALS'
                cursor.execute("UPDATE DATA_urls SET visited = 'Ye' where links = %s", k)
                mydb.commit()

def process(url):
    proxies = {"http": "http://proxy4.nehu.ac.in:3128",
               "https": "https://proxy4.nehu.ac.in:3128"}
    page = urllib.urlopen(url, proxies=None)
    text = page.read()
    page.close()
    soup = BeautifulSoup(text)
    file = open('s.txt', 'w')
    cursor.execute("INSERT INTO DATA_urls(links,parent,visited) VALUES(%s,'NULL','Ye')", url)
    for tag in soup.findAll('a', href=True):
        tag['href'] = urlparse.urljoin(url, tag['href'])
        print tag['href']
        if re.match(ur'(?i)\b((?:https?:\/\/|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))', tag['href']):
            cursor.execute("INSERT INTO DATA_urls(links,parent,visited) VALUES(%s,%s,'No')", (tag['href'], url))
            file.write('\n')
            file.write(tag['href'])
    #file.close()
    # cursor.execute("SELECT * FROM url")
    # rows = cursor.fetchall()
    mydb.commit()
    process2(1)

def main():
    if len(sys.argv) == 1:
        print 'No url !!'
        sys.exit(1)
    for url in sys.argv[1:]:
        process(url)

main()
I got no error, but nothing is updated in my database.
My table DESC:
+---------+---------------+------+-----+---------+-------+
| Field | Type | Null | Key | Default | Extra |
+---------+---------------+------+-----+---------+-------+
| links | varchar(1000) | YES | | NULL | |
| parent | varchar(1000) | YES | | NULL | |
| visited | varchar(2) | YES | | NULL | |
+---------+---------------+------+-----+---------+-------+

Change it to:
mydb = MySQLdb.connect(host='localhost',
                       user='root',
                       passwd='shailang',
                       db='mydb')
cursor = mydb.cursor()

def process2(url):
    flag = 0
    cursor.execute("SELECT links FROM DATA_urls where visited = 'Ye'")
    Yes_rows = cursor.fetchall()
    cursor.execute("SELECT links FROM DATA_urls where visited = 'No'")
    No_rows = cursor.fetchall()
    count = len(No_rows)
    for i in range(0, count):
        print 'NOOOOOOOOOO'
        k = No_rows[i]
        print k
        for j in range(0, len(Yes_rows)):
            print "YESSSSSSSSSSSSSS"
            k1 = Yes_rows[j]
            print k1
            if k1 == k:
                print 'EQUALS'
                cursor.execute("UPDATE DATA_urls SET visited = 'Ye' where links = %s", k)
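As an aside, the whole comparison can also be pushed into the database, which avoids the nested loops entirely. A minimal sketch, assuming the same DATA_urls table and MySQLdb connection as above (the self-join UPDATE and the empty-check helper are illustrative, not the asker's code):

import sys
import MySQLdb

mydb = MySQLdb.connect(host='localhost', user='root', passwd='shailang', db='mydb')
cursor = mydb.cursor()

def mark_visited_duplicates():
    # Mark every 'No' link as 'Ye' when the same link already exists with visited = 'Ye';
    # the self-join replaces the two nested Python loops.
    cursor.execute("""
        UPDATE DATA_urls AS no_side
        JOIN DATA_urls AS yes_side
          ON no_side.links = yes_side.links AND yes_side.visited = 'Ye'
        SET no_side.visited = 'Ye'
        WHERE no_side.visited = 'No'
    """)
    mydb.commit()

def all_visited():
    # True when the "where visited = 'No'" query returns no rows, i.e. the
    # condition under which the asker wants the program to exit.
    cursor.execute("SELECT COUNT(*) FROM DATA_urls WHERE visited = 'No'")
    (remaining,) = cursor.fetchone()
    return remaining == 0

if all_visited():
    sys.exit(0)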

Related

I want to return the elements of a list into a data frame, as below. I am a beginner.

I attempted 3 different ways:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from selenium import webdriver

driver = webdriver.Chrome(executable_path='C:/Users/lemonade/Documents/work/chromedriver')
my_url = "https://www.carehome.co.uk/"

def make_soup(url):
    driver.get(url)
    m_soup = soup(driver.page_source, features='html.parser')
    return m_soup

main_page = make_soup(my_url)
boroughs = [borough.text.strip() for borough in main_page.select('.seo_links.seo_links_country [href]')]
indexs = list(range(16,19))
london_list = [boroughs[i] for i in indexs]
boroughs1 = [bo.replace("Borough","") for bo in london_list]
boroughs2 = [b1.replace("&","and") for b1 in boroughs1]
boroughs3 = ['-'.join(b2.split()) for b2 in boroughs2]
borough_links = ["https://www.carehome.co.uk/care_search_results.cfm/searchunitary/" + b3 for b3 in boroughs3]
borough_soup = [make_soup(b_link) for b_link in borough_links]

for soups in borough_soup:
    titles = [title.text.strip() for title in soups.select('.home-name [href]')]
    return(titles)

for soups in borough_soup:
    addresses = [address.text.strip() for address in soups.select('.home-name>p.grey')]
    return(addresses)

df = pd.DataFrame(zip(titles, addresses), columns=['title','address'])
print(df)
I tried the code below instead. This gave AttributeError: 'list' object has no attribute 'text'.
title_html = [soups.select('.home-name [href]') for soups in borough_soup]
titles = [title.text.strip() for title in title_html ]
addresses_html =[soups.select('.home-name>p.grey') for soups in borough_soup]
addresses = [address.text.strip() for address in addresses_html]
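As an aside, the error happens because soups.select(...) returns a list of tags per page, so each element iterated here is itself a list rather than a single tag. A minimal sketch of a flattening fix (not part of the original attempts):

title_html = [soups.select('.home-name [href]') for soups in borough_soup]
# Each element of title_html is a list of tags, so iterate two levels deep:
titles = [t.text.strip() for tags in title_html for t in tags]
addresses_html = [soups.select('.home-name>p.grey') for soups in borough_soup]
addresses = [a.text.strip() for tags in addresses_html for a in tags]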
I tried to create and append to a list and return that list (see below). This just outputted a single element from the list.
def func(borough_soup):
    for soups in borough_soup:
        titles = [title_html.text.strip() for title_html in soups.select('.home-name [href]')]
        for title in titles:
            titles1 = []
            titles1.append(title)
            return(titles1)
Any help would be much appreciated!
This was the fix: create a function with an empty list outside the loop and append each page's results to it (in the attempt above, the list was re-created and returned inside the loop, so only one element came back). After that, concatenate the DataFrames.
def title(x):
    titles1 = []
    for soups in borough_soup:
        titles = [title.text.strip() for title in soups.select('.home-name [href]')]
        titles1.append(titles)
    return(titles1)

titles = title(borough_soup)

def address(x):
    address1 = []
    for soups in borough_soup:
        addresses = [address.text.strip() for address in soups.select('.home-name>p.grey')]
        address1.append(addresses)
    return(address1)

addresses = address(borough_soup)

indexs2 = list(range(0,2))
df_list = [pd.DataFrame(zip(titles[i], addresses[i])) for i in indexs2]
df = pd.concat(df_list)
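For reference, the same result can be built a bit more compactly, one DataFrame per borough page, concatenated in a single pass. A minimal sketch, assuming borough_soup and the CSS selectors from the code above (the borough_frame name is illustrative), and that each page yields the same number of titles and addresses:

import pandas as pd

def borough_frame(soups):
    # Titles and addresses come from the same page, so they stay aligned row by row.
    titles = [t.text.strip() for t in soups.select('.home-name [href]')]
    addresses = [a.text.strip() for a in soups.select('.home-name>p.grey')]
    return pd.DataFrame({'title': titles, 'address': addresses})

df = pd.concat([borough_frame(soups) for soups in borough_soup], ignore_index=True)
print(df)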

Scrapy: detect if an XPath does not exist

I've been trying to make my first crawler and I've accomplished what I needed (get the first and second shop's shipping info and prices), but with 2 crawlers instead of 1, because I have a big blocker here.
When there is more than one shop, the output result is:
In [1]: response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()').extract()
Out[1]:
[u'ENV\xcdO 3,95\u20ac ',
u'ENV\xcdO GRATIS',
u'ENV\xcdO GRATIS',
u'ENV\xcdO 4,95\u20ac ']
To get only the second result I'm using:
In [2]: response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')[1].extract()
Out[2]: u'ENV\xcdO GRATIS'
But when there's no second result (only one shop) I'm getting:
IndexError: list index out of range
And the crawler skips the full page, even if the other items have data.
After trying several times I settled on a quick solution to get the result: two crawlers, one for the first shops and the other for the second ones, but now I want to do it cleanly with only one crawler.
Any help, tip or advice will be appreciated; this is my first try at making a recursive crawler with Scrapy and I'm quite enjoying it.
Here's the code:
# -*- coding: utf-8 -*-
import scrapy
from Guapalia.items import GuapaliaItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class GuapaliaSpider(CrawlSpider):
    name = "guapalia"
    allowed_domains = ["guapalia.com"]
    start_urls = (
        'https://www.guapalia.com/perfumes?page=1',
        'https://www.guapalia.com/maquillaje?page=1',
        'https://www.guapalia.com/cosmetica?page=1',
        'https://www.guapalia.com/linea-de-bano?page=1',
        'https://www.guapalia.com/parafarmacia?page=1',
        'https://www.guapalia.com/solares?page=1',
        'https://www.guapalia.com/regalos?page=1',
    )
    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@class='js-pager']/a[contains(text(),'Siguientes')]"), follow=True),
        Rule(LinkExtractor(restrict_xpaths="//div[@class='list-display__item list-display__item--product']/div/a[@class='col-xs-10 col-sm-10 col-md-12 clickOnProduct']"), callback='parse_articles', follow=True),
    )

    def parse_articles(self, response):
        item = GuapaliaItem()
        articles_urls = response.url
        articles_first_shop = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="retailer-logo autoimage-container"]/img/@title').extract()
        articles_first_shipping = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="shipping"]/p//text()').extract()
        articles_second_shop = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div/img/@title')[1].extract()
        articles_second_shipping = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')[1].extract()
        articles_name = response.xpath('//div[@id="ProductDetail"]/@data-description').extract()
        item['articles_urls'] = articles_urls
        item['articles_first_shop'] = articles_first_shop
        item['articles_first_shipping'] = articles_first_shipping
        item['articles_second_shop'] = articles_second_shop if articles_second_shop else 'N/A'
        item['articles_second_shipping'] = articles_second_shipping
        item['articles_name'] = articles_name
        yield item
Basic output of the crawler, with the right format, when there is more than one shop:
2017-09-21 09:53:11 [scrapy] DEBUG: Crawled (200) <GET https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355> (referer: https://www.guapalia.com/perfumes?page=1)
2017-09-21 09:53:11 [scrapy] DEBUG: Scraped from <200 https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355>
{'articles_first_shipping': [u'ENV\xcdO GRATIS'],
'articles_first_shop': [u'DOUGLAS'],
'articles_name': [u'ZEN edp vaporizador 100 ml'],
'articles_second_shipping': u'ENV\xcdO 3,99\u20ac ',
'articles_second_shop': u'BUYSVIP',
'articles_urls': 'https://www.guapalia.com/zen-edp-vaporizador-100-ml-75355'}
The problem is when a second shop doesn't exist, because my code for the second-shop fields raises:
IndexError: list index out of range
SOLUTION (thanks to @Tarun Lalwani):
def parse_articles(self, response):
    item = GuapaliaItem()
    articles_urls = response.url
    articles_first_shop = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="retailer-logo autoimage-container"]/img/@title').extract()
    articles_first_shipping = response.xpath('//div[@class="container-fluid list-display-box--best-deal"]/div/div/div/div[@class="shipping"]/p//text()').extract()
    articles_second_shop = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div/img/@title')
    articles_second_shipping = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')
    articles_name = response.xpath('//div[@id="ProductDetail"]/@data-description').extract()
    if len(articles_second_shop) > 1:
        item['articles_second_shop'] = articles_second_shop[1].extract()
    else:
        item['articles_second_shop'] = 'Not Found'
    if len(articles_second_shipping) > 1:
        item['articles_second_shipping'] = articles_second_shipping[1].extract()
    else:
        item['articles_second_shipping'] = 'Not Found'
    item['articles_urls'] = articles_urls
    item['articles_first_shop'] = articles_first_shop
    item['articles_first_shipping'] = articles_first_shipping
    item['articles_name'] = articles_name
    yield item
You need to get the result into a variable first. Then you can make a decision based on its length:
texts = response.xpath('//li[@class="container list-display-box__list__container"]/div/div/div/div/div[@class="shipping"]/p//text()')
if len(texts) > 1:
    data = texts[1].extract()
elif len(texts) == 1:
    data = texts[0].extract()
else:
    data = "Not found"

How to iterate dictionaries and save into the database in Python 2.7?

This is my dictionary
{u'krishna': [u'vijayawada', u'gudivada', u'avanigada']}
I want to iterate over the items and save them in the database. My model is:
class Example(models.Model):
    district = models.CharField(max_length=50, **optional)
    taluk = models.CharField(max_length=20, **optional)
It should save as:
+----------+------------+
| district | taluk      |
+----------+------------+
| krishna  | vijayawada |
| krishna  | gudivada   |
| krishna  | avanigada  |
+----------+------------+
You can do something like this:
from models import Example

places = {u'krishna': [u'vijayawada', u'gudivada', u'avanigada']}
for district in places:
    for taluk in places[district]:
        e = Example(district=district, taluk=taluk)
        e.save()
Alternatively:
for key in dict:
    for value in dict[key]:
        example = Example()
        example.district = key
        example.taluk = value
        example.save()
This will work for you:
for districtName in places.keys():
    for talukName in places[districtName]:
        print districtName, talukName  # Try to print it
        addData = Example.objects.create(district=districtName, taluk=talukName)
        addData.save()
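If the dictionary grows large, the per-row saves can also be batched into one query. A minimal sketch, assuming the same Example model and places dictionary as above (bulk_create is the standard Django ORM batch insert; this variant is not from the answers above):

rows = [Example(district=district, taluk=taluk)
        for district, taluks in places.items()
        for taluk in taluks]
# Build the unsaved instances first, then insert them all in a single query.
Example.objects.bulk_create(rows)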

Beautifulsoup extraction using for loop into table in Python 2

Platform: Python 2.7.13 on Windows 7 with the Spyder IDE.
Please bear with me, I'm totally new to both BeautifulSoup and Python. I am stuck at the last two lines.
Q. I want to import the details from the URL below and put them in a table, that is, the information in the dd tags.
The first part of the code works well to get the link and all the school details. However, I'm having trouble running the for loop to get the remaining elements.
The full code is below:
# coding: utf-8
import urllib2

url = "http://tools.canlearn.ca/cslgs-scpse/cln-cln/rep-fit/p/af.p.clres.do?institution_id=default&searchType=ALL&searchString=&progLang=A&instType=B&prov_1=prov_1&progNameOnly=N&start=0&finish=999&section=1"
#try:
page = urllib2.urlopen(url)
#except (httplib.HTTPException, httplib.IncompleteRead, urllib2.URLError):
#    missing.put(tmpurl)

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

rooturl = "http://tools.canlearn.ca/cslgs-scpse/cln-cln/rep-fit/p/"

from bs4 import BeautifulSoup
soup = BeautifulSoup(page)
info = soup.find_all("div", class_="wb-frm")
names = [x.ol.find_all("li") for x in info][0]

def f(string):
    return str(string[0] + ', ' + string[-1])

names2 = [names[i:i+3] for i in range(0, len(names), 3)]
diploma = [[x[0].findAll("a")[0].find(text=True).strip(), x[1].string, f(x[2].find(text=True).strip().split())] for x in names2]
links = [x.ol.find_all("a") for x in info][0]
links2 = [y.get('href') for y in links]
links3 = [rooturl + z for z in links2]

for i in xrange(len(links3)):
    url_link = urllib2.urlopen(links3[i])
    link_html = BeautifulSoup(url_link)
    # Changed the code here based on the good answer given by heyiamt.
    # It was:
    #     link_html2 = link_html.find_all("div", class_="wb-frm")
    #     website = link_html2[0].a.get('href')
    #     dd[y] = link2[y].get('dd')
    #     diploma[i].append(dd)
    #     diploma[i].append(link_html2[0].a.get('href'))
    #     diploma[i].append(website)
    # Get the whole box for the general info:
    #     general_info_html = link_html.find_all("div", class_="panel-body")
    #     general_info_html2 = [y.findAll('dd') for y in general_info_html[2:]]
    #     general_info = {}
    #     for x in general_info_html2:
    #         general_info.update({x[0].find(text='dt') : x[1].find(text='dd')})
    #         general_info.update({x[0].get('dd')})
    #     diploma[i].append(general_info)
    for d in link_html.find_all('dd'):
        if d.a is not None:
            diploma[i].append(d.a.string)
            continue
        if d.string is not None:
            diploma[i].append(d.string)
            continue
        diploma[i].append(d.contents[0])

import pandas as pd
col1 = [x[1] for x in diploma]
col2 = [x[0] for x in diploma]
col3 = [x[2] for x in diploma]
col4 = [x[3] for x in diploma]
col5 = [x[4] for x in diploma]
col55 = {'Program Level' : [x.get('Program Level:') for x in col5],
         'Credential Type' : [x.get('Credential Type:') for x in col5],
         'Joint Program Level' : [x.get('Joint Program Level:') for x in col5],
         'Joint Credential Type' : [x.get('Joint Credential Type:') for x in col5],
         'Address' : [x.get('Address:') for x in col5],
         'Telephone' : [x.get('Telephone:') for x in col5],
         'Email' : [x.get('Email:') for x in col5],
         'Fax' : [x.get('Fax:') for x in col5],
         'Toll Free' : [x.get('Toll Free:') for x in col5]
         }
df = pd.DataFrame(col1, columns=['University'])
df2 = pd.DataFrame(col55)
df['Type'] = col2
df['City'] = col3
df['Website'] = col4
df['Address'] = df2['Address']
df['Credential Type'] = df2['Credential Type']
df['Email'] = df2['Email']
df['Fax'] = df2['Fax']
df['Joint Credential Type'] = df2['Joint Credential Type']
df['Joint Program Level'] = df2['Joint Program Level']
df['Program Level'] = df2['Program Level']
df['Telephone'] = df2['Telephone']
df['Toll Free'] = df2['Toll Free']
df.to_csv('data1.csv', encoding='utf-8')
Expected result (i.e. with "dd" tags):
http://www.rosewoodcollege.ca/program-information/
Apprenticeship Program Certificate
Not entered
Not entered
Calgary, Alberta T3J 5H3
(403) 798-7447
mail@rosewoodcollege.ca
For this site, you can just use BeautifulSoup to find the tags within the divs without actually scrolling through the divs themselves. These particular dd tags have a bit of fishiness to them, though. Here's a shot at managing the different possibilities.
# Using link_html from your code above.
dd_strs = []
for d in link_html.find_all('dd'):
    if d.a is not None:
        dd_strs.append(d.a.string)
        continue
    if d.string is not None:
        dd_strs.append(d.string)
        continue
    dd_strs.append(d.contents[0])

for dd_str in dd_strs:
    print dd_str
Output is
http://www.rosewoodcollege.ca/program-information/
Apprenticeship Program
Certificate
Not entered
Not entered
Rosewood College
(403) 798-7447
mail@rosewoodcollege.ca
2015-12-30
If you can rely on the dt tags to always be mated, in order, to the dd tags, you can just repeat the above but for dt instead of dd and merge the lists accordingly.
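A minimal sketch of that dt/dd pairing, assuming link_html from the code above and that every dt has a matching dd in document order (something the answer does not verify):

# Collect the dt labels the same way as the dd values, then zip them into a dict.
dt_strs = [dt.get_text(strip=True) for dt in link_html.find_all('dt')]

dd_strs = []
for d in link_html.find_all('dd'):
    if d.a is not None:
        dd_strs.append(d.a.string)
    elif d.string is not None:
        dd_strs.append(d.string)
    else:
        dd_strs.append(d.contents[0])

# If the two lists line up, this gives e.g. {'Telephone:': '(403) 798-7447', ...}
general_info = dict(zip(dt_strs, dd_strs))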

Django queryset not returning distinct values

I have a query that for some reason is not returning distinct values even though I have specified distinct. I thought it might be because of the only(), so I removed that, but the list is still the same:
circuit_providers = CircuitInfoData.objects.only('provider').values('provider').distinct()
I just want a list of unique providers.
models.py:
from __future__ import unicode_literals
from django.db import models
import string
import random
import time
import os

# Create your models here.
from service.models import ServiceContacts

def site_photos_path(instance, filename):
    file, extension = os.path.splitext(filename)
    # file will be uploaded to MEDIA_ROOT/user_<id>/<filename>
    chars = string.ascii_uppercase + string.digits
    random_string = ''.join(random.choice(chars) for _ in range(6))
    filename = '%s-%s%s' % (random_string, time.strftime("%d-%m-%H-%M-%S"), extension)
    return 'site_photos/{0}'.format(filename)

def service_upload_path(instance, filename):
    file, extension = os.path.splitext(filename)
    # file will be uploaded to MEDIA_ROOT/user_<id>/<filename>
    chars = string.ascii_uppercase + string.digits
    random_string = ''.join(random.choice(chars) for _ in range(6))
    filename = '%s-%s%s' % (random_string, time.strftime("%d-%m-%H-%M-%S"), extension)
    return 'service_files/{0}'.format(filename)

def site_files_path(instance, filename):
    file, extension = os.path.splitext(filename)
    # file will be uploaded to MEDIA_ROOT/user_<id>/<filename>
    chars = string.ascii_uppercase + string.digits
    random_string = ''.join(random.choice(chars) for _ in range(6))
    filename = '%s-%s%s' % (random_string, time.strftime("%d-%m-%H-%M-%S"), extension)
    return 'site_files/{0}'.format(filename)

provider_choices = (
    ('KCOM', 'KCOM'),
    ('BT', 'BT'),
    ('EE', 'EE'),
    ('THREE', 'THREE'),
)

circuit_choices = (
    ('DSL', 'DSL'),
    ('VDSL', 'VDSL'),
    ('MPLS', 'MPLS'),
    ('4G', '4G'),
    ('Internet Leased Line', 'Internet Leased Line'),
)

subnet_mask_choices = (
    ('/16', '/16'),
    ('/24', '/24'),
    ('/25', '/25'),
    ('/26', '/26'),
    ('/27', '/27'),
    ('/28', '/28'),
    ('/29', '/29'),
    ('/30', '/30'),
    ('/31', '/31'),
)

class ShowroomConfigData(models.Model):
    location = models.CharField(max_length=50)
    subnet = models.GenericIPAddressField(protocol='IPv4')
    r1_loopback_ip = models.GenericIPAddressField(protocol='IPv4', verbose_name="R1 Loopback IP")
    r2_loopback_ip = models.GenericIPAddressField(protocol='IPv4', verbose_name="R2 Loopback IP")
    opening_date = models.DateField(verbose_name="Showroom opening date")
    last_hw_refresh_date = models.DateField(verbose_name="Date of latest hardware refresh")
    is_showroom = models.BooleanField(default=True, verbose_name="Is this site a showroom?")

    class Meta:
        verbose_name = "Showroom Data"
        verbose_name_plural = "Showroom Data"
        ordering = ('location',)

    def __unicode__(self):
        return self.location

class MajorSiteInfoData(models.Model):
    location = models.CharField(max_length=200)
    major_subnet = models.GenericIPAddressField(protocol='IPv4', verbose_name="Major Site Subnet")
    routed_subnet = models.GenericIPAddressField(protocol='IPv4', verbose_name="Routed Link Subnet")
    bgp_as = models.CharField(max_length=6, verbose_name="BGP AS Number")

    class Meta:
        verbose_name = "Major Site Data"
        verbose_name_plural = "Major Site Data"

    def __unicode__(self):
        return self.location

class CircuitInfoData(models.Model):
    showroom_config_data = models.ForeignKey(ShowroomConfigData, verbose_name="Install Showroom")
    major_site_info = models.ForeignKey(MajorSiteInfoData, verbose_name="Install Site")
    circuit_type = models.CharField(max_length=100, choices=circuit_choices)
    circuit_speed = models.IntegerField(blank=True)
    circuit_bearer = models.IntegerField(blank=True)
    provider = models.CharField(max_length=200, choices=provider_choices)
    ref_no = models.CharField(max_length=200, verbose_name="Reference No")

    class Meta:
        verbose_name = "Circuit Data"
        verbose_name_plural = "Circuit Data"
        ordering = ('showroom_config_data__location', 'circuit_speed')

    def __unicode__(self):
        return '%s | %s | %s | %s | %s' % (self.showroom_config_data.location, self.major_site_info.location, self.provider, self.service_type, self.ref_no)
Results from the shell below:
[root@network-tools infternal]# python manage.py shell
Python 2.7.5 (default, Nov 20 2015, 02:00:19)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-4)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
(InteractiveConsole)
>>> from networks.models import CircuitInfoData
>>> d = CircuitInfoData.objects.values('provider').distinct()
>>> for item in d:
... print item
...
{'provider': u'BT'}
{'provider': u'BT'}
{'provider': u'KCOM'}
{'provider': u'BT'}
{'provider': u'BT'}
{'provider': u'KCOM'}
.....
>>> print d.query
SELECT DISTINCT "networks_circuitinfodata"."provider", "networks_showroomconfigdata"."location", "networks_circuitinfodata"."circuit_speed" FROM "networks_circuitinfodata" INNER JOIN "networks_showroomconfigdata" ON ("networks_circuitinfodata"."showroom_config_data_id" = "networks_showroomconfigdata"."id") ORDER BY "networks_showroomconfigdata"."location" ASC, "networks_circuitinfodata"."circuit_speed" ASC
>>>
One thing I've noticed is that when I print items in the shell as above:
#### with def __unicode__(self): #####
>>> from networks.models import CircuitInfoData
>>> d = CircuitInfoData.objects.only('provider').distinct()
>>> for i in d:
... print i
...
Location1 | Showroom | BT | DSL | N/A
Location2 | Showroom | BT | MPLS | XXXX
Location2 | Showroom | KCOM | MPLS | XXXX
Location3 | Showroom | BT | MPLS | XXXX
Location3 | Showroom | BT | DSL | N/A
Location4 | Showroom | KCOM | MPLS | XXXXX
...
#### with out def __unicode__(self): #####
>>> from networks.models import CircuitInfoData
>>> d = CircuitInfoData.objects.only('provider').distinct()
>>> for i in d:
... print i
...
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
CircuitInfoData_Deferred_circuit_bearer_circuit_cfb3d62ef325a6acfc8ddcb43c8ae1c6 object
...
#### with either ####
>>> for i in d:
... print i.provider
...
BT
BT
KCOM
BT
BT
KCOM
...
The documentation for distinct says
Returns a new QuerySet that uses SELECT DISTINCT in its SQL query.
This eliminates duplicate rows from the query results.
By default, a QuerySet will not eliminate duplicate rows. In practice,
this is rarely a problem, because simple queries such as
Blog.objects.all() don’t introduce the possibility of duplicate result
rows.
Distinct gives you distinct rows, but you are looking at only one of the fields in the record, and values in that field can be duplicated unless the field has a unique constraint on it, which in this case it doesn't.
If you happen to be using PostgreSQL you can do
CircuitInfoData.objects.distinct('provider')
to achieve your objective.
UPDATE:
Since you mentioned in the comments that you use sqlite, use this solution.
CircuitInfoData.objects.values('provider').distinct()
This will work because now each row has only one column. The resulting query will be similar to:
SELECT DISTINCT "someapp_circuitinfodata"."name" FROM "someapp_circuitinfodata"
UPDATE 2:
Notice that you have overridden the __unicode__ function.
def __unicode__(self):
    return '%s | %s | %s | %s | %s' % (self.showroom_config_data.location, self.major_site_info.location,
                                       self.provider, self.service_type, self.ref_no)
You are referring to fields in the related models. This is going to be very costly (unless you use select_related). Also note that if you iterate through a queryset and use print for debugging, it will give you misleading results, since what you are seeing is the output of __unicode__, a rather complex function.
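A minimal sketch of that select_related point, using the CircuitInfoData model above (the queryset variables are illustrative):

# Without select_related, printing each row fires extra queries for
# showroom_config_data and major_site_info inside __unicode__.
circuits = CircuitInfoData.objects.select_related('showroom_config_data', 'major_site_info')
for circuit in circuits:
    print circuit  # the related rows were fetched in the same query via JOINs

# For the original goal, a flat list of unique provider names; order_by() clears the
# model's Meta ordering, whose columns the printed SQL above shows being added to the
# SELECT DISTINCT and thereby reintroducing duplicates.
providers = CircuitInfoData.objects.values_list('provider', flat=True).order_by().distinct()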