Failing to display images - Django

I am writing a Django app that is a clone of Craigslist and displays images of the searched products. The issue is that I am failing to display the actual image on the card; I only get the broken-image icon in the top left corner of the card.
import requests
from bs4 import BeautifulSoup
from django.shortcuts import render
from urllib.parse import quote_plus

from . import models

BASE_CRAIGSLIST_URL = 'https://losangeles.craigslist.org/d/services/search/bbb?query={}'
BASE_IMAGE_URL = 'https://images.craigslist.org/{}_300x300.jpg'


# Create your views here.
def home(request):
    return render(request, 'base.html')


def new_search(request):
    search = request.POST.get('search')
    models.Search.objects.create(search=search)
    final_url = BASE_CRAIGSLIST_URL.format(quote_plus(search))
    response = requests.get(final_url)
    data = response.text
    soup = BeautifulSoup(data, features='html.parser')
    post_listings = soup.find_all('li', {'class': 'result-row'})

    final_postings = []
    for post in post_listings:
        post_title = post.find(class_='result-title').text
        post_url = post.find('a').get('href')
        if post.find(class_='result-price'):
            post_price = post.find(class_='result-price').text
        else:
            post_price = 'N/A'

        if post.find(class_='result-image').get('data-ids'):
            post_image_id = post.find(class_='result-image').get('data-ids').split(',')[0].split(':')
            post_image_url = BASE_IMAGE_URL.format(post_image_id)
            print(post_image_url)
        else:
            post_image_url = 'https://craigslist.org/images/peace.jpg'

        final_postings.append((post_title, post_url, post_price, post_image_url))

    stuff_for_frontend = {
        'search': search,
        'final_postings': final_postings,
    }
    return render(request, 'my_app/new_search.html', stuff_for_frontend)

So I have figured it out: I was trying to access a single image, but the data-ids attribute actually holds a whole slideshow of images, so I had to select the first image ID and display that one, like this:
post_image_id = post.find(class_='result-image').get('data-ids').split(',')[0].split(':')[1]
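For reference, here is a minimal sketch of what that chain does, using a made-up data-ids value (the real attribute comes from the Craigslist result markup, so the IDs below are purely hypothetical):

# Hypothetical data-ids value copied from a result row; the real one
# depends on the listing being scraped.
data_ids = '3:00a0a_hypotheticalId1,3:00b0b_hypotheticalId2'

BASE_IMAGE_URL = 'https://images.craigslist.org/{}_300x300.jpg'

first_entry = data_ids.split(',')[0]       # '3:00a0a_hypotheticalId1'
post_image_id = first_entry.split(':')[1]  # '00a0a_hypotheticalId1'
post_image_url = BASE_IMAGE_URL.format(post_image_id)
print(post_image_url)
# https://images.craigslist.org/00a0a_hypotheticalId1_300x300.jpg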

Related

Webscraping is buggy through AWS Lambda, but works fine in VS Code and on EC2 instance

My dependencies are fine, Lambda doesn't raise any errors, and the code runs smoothly. I also checked memory (512 MB) and timeout (5 min). But instead of a list of HTML divs I'm getting a list of empty lists. Interestingly, there are quite a few nested lists, so the count might even match the number of divs I'm trying to scrape; they're just completely empty.
import requests
from bs4 import BeautifulSoup


def lambda_handler(event, context):
    url3 = 'https://www.szybko.pl/l/na-sprzedaz/lokal-mieszkalny/Wroc%C5%82aw?assetType=lokal-mieszkalny&localization_search_text=Wroc%C5%82aw&market=aftermarket&price_min_sell=200000&price_max_sell=400000&meters_min=30&rooms_min=2'

    def get_last_page3(url):
        result = requests.get(url)
        source = result.content
        soup = BeautifulSoup(source, 'html.parser')
        last_page = soup.find_all("li", {'class': 'blank'})[1].text
        return int(last_page)

    def get_list_of_soups3(url):
        list_of_soups = []
        for page in range(1, get_last_page3(url) + 1):
            try:
                result = requests.get(url + '&strona=' + str(page))
                source = result.content
                soup = BeautifulSoup(source, 'html.parser')
                ads = soup.find_all("div", {'class': "gt-listing-item-asset listing-item"})
                list_of_soups.append(ads)
            except Exception as e:
                print(e)
                break
        return list_of_soups

    all_ads3 = []
    try:
        for soup in get_list_of_soups3(url3):
            for s in soup:
                name = s.find("a")['aria-label'].replace('Szczegóły ogłoszenia - ', '')
                district = s.find("a", {'class': 'mapClassClick list-elem-address popup-gmaps'}).text.replace('\n', '').replace(' ', '').replace(', dolnośląskie', '')
                price = s.find("span", {'class': 'listing-price'}).text.strip().replace(' zł', '').replace(' ', '')[:6]
                rooms = s.find("li", {'class': 'asset-feature rooms'}).text.replace(' ', '')
                sq = s.find("li", {'class': 'asset-feature area'}).text.replace('m²', '').replace(',', '.')
                price_sq = s.find("span", {'class': 'listing-price'}).find('i').text.replace('zł/m²', '').replace(' ', '').strip()
                link = s.find('a')['href'].strip()
                ad = [name, district, int(price), int(rooms), round(float(sq)), int(price_sq), link]
                all_ads3.append(ad)
    except Exception as e:
        print('error: website changed or unresponsive', e)
    return get_list_of_soups3(url3)
Also, similar code scraping a similar website works perfectly fine from both the IDE and Lambda, and both Lambdas are configured in the same way.
I'm using Python with the requests and Beautiful Soup libraries.
I was able to solve this by changing the HTML class of the divs scraped in the second function. I found it with print-statement debugging.
I'm not sure what the reason is; my guess would be that Lambda couldn't handle a photo thumbnail that was included in the original div. Maybe it has something to do with the way ads are generated on this particular website?
The code below also includes my print statements as comments and has the try/except removed. The crucial change is the find_all call in get_list_of_soups3: ads = soup.find_all("div", {'class': "listing-content"})
import requests
from bs4 import BeautifulSoup


def lambda_handler(event, context):
    # Scraping url3: szybko.pl
    url3 = 'https://www.szybko.pl/l/na-sprzedaz/lokal-mieszkalny/Wroc%C5%82aw?assetType=lokal-mieszkalny&localization_search_text=Wroc%C5%82aw&market=aftermarket&price_min_sell=200000&price_max_sell=400000&meters_min=30&rooms_min=2'

    def get_last_page3(url):
        result = requests.get(url)
        source = result.content
        # print('SOURCE:', source)
        soup = BeautifulSoup(source, 'html.parser')
        last_page = soup.find_all("li", {'class': 'blank'})[1].text
        print('PAGE:', last_page)
        return int(last_page)

    def get_list_of_soups3(url):
        list_of_soups = []
        for page in range(1, get_last_page3(url) + 1):
            try:
                result = requests.get(url + '&strona=' + str(page))
                # print('RESULT:', result)
                source = result.content
                soup = BeautifulSoup(source, 'html.parser')
                # print('SOUP:', soup)  # it's fine
                ads = soup.find_all("div", {'class': "listing-content"})
                # print('ADS:', ads)
                list_of_soups.append(ads)
            except Exception as e:
                print(e)
                break
        return list_of_soups

    all_ads3 = []
    for soup in get_list_of_soups3(url3):
        for s in soup:
            name = s.find("a", {'class': 'listing-title-heading hide-overflow-text'}).find("div", {'class': "tooltip"}).text  # .replace('Szczegóły ogłoszenia - ', '')
            district = s.find("a", {'class': 'mapClassClick list-elem-address popup-gmaps'}).text.replace('\n', '').replace(' ', '').replace(', dolnośląskie', '').strip()
            price = s.find("div", {'class': 'listing-title'}).find_all("span")[2]['content']  # .text.strip().replace(' zł', '').replace(' ', '')[:6]
            rooms = s.find("li", {'class': 'asset-feature rooms'}).text.replace(' ', '')
            sq = s.find("li", {'class': 'asset-feature area'}).text.replace('m²', '').replace(',', '.')
            price_sq = int(price) / round(float(sq))
            link = s.find('a')['href'].strip()
            ad = [name, district, int(price), int(rooms), round(float(sq)), int(price_sq), link]
            all_ads3.append(ad)
    return len(all_ads3)

Pillow PNG compressing

I'm making a simple app that compresses images in JPEG and PNG format using the Pillow library, Python 3, and Django. I made a simple view that identifies the format, saves the compressed image, and reports some compression statistics. With JPEG images it works really well: I get compression close to 70-80% of the original size, and it works really fast. But if I upload a PNG it works much worse: compression takes a long time, and it is only 3-5% of the original size. I'm trying to find ways to improve the compression script and I'm stuck.
Right now I've got this script in my Django compress view:
from django.shortcuts import render, redirect, get_object_or_404, reverse
from django.contrib.auth import login, authenticate, logout
from django.contrib.auth.models import User
from django.http import HttpResponse, HttpResponseRedirect
from django.http import JsonResponse
from django.contrib import auth
from .forms import InputForm, SignUpForm, LoginForm, FTPForm
import os
import sys
from PIL import Image
from .models import image, imagenew, FTPinput
from django.views import View
import datetime
from django.utils import timezone
import piexif


class BasicUploadView(View):
    def get(self, request):
        return render(self.request, 'main/index.html', {})

    def post(self, request):
        form = InputForm(self.request.POST, self.request.FILES)
        if form.is_valid():
            photo = form.save(commit=False)
            photo.name = photo.image.name
            photo.delete_time = timezone.now() + datetime.timedelta(hours=1)
            photo.user = request.user
            photo.size = photo.image.size
            photo = form.save()
            name = (photo.name).replace(' ', '_')
            picture = Image.open(photo.image)
            if picture.mode in ('RGB'):
                piexif.remove('/home/andrey/sjimalka' + photo.image.url)
                picture.save('media/new/' + name, "JPEG", optimize=True, quality=75)
                newpic = 'new/' + name
                new = imagenew.objects.create(
                    name=name,
                    image=newpic,
                    delete_time=timezone.now() + datetime.timedelta(hours=1),
                    user=request.user,
                )
                if new.image.size < photo.image.size:
                    diff = round((new.image.size - photo.image.size) / float(photo.image.size) * 100, 2)
                else:
                    # 'Не удалось сжать файл' = 'Could not compress the file'
                    diff = str(round((new.image.size - photo.image.size) / float(photo.image.size) * 100, 2)) + ' Не удалось сжать файл'
                oldsize = round(photo.image.size / 1000000, 2)
                newsize = round(new.image.size / 1000000, 2)
                id = new.pk
                imagenew.objects.filter(pk=id).update(size=new.image.size)
            elif picture.mode != ('RGB'):
                picture.save('media/new/' + name, "PNG", optimize=True, quality=75)
                newpic = 'new/' + name
                new = imagenew.objects.create(
                    name=name,
                    image=newpic,
                    delete_time=timezone.now() + datetime.timedelta(hours=1),
                    user=request.user,
                )
                if new.image.size < photo.image.size:
                    diff = round((new.image.size - photo.image.size) / float(photo.image.size) * 100, 2)
                else:
                    # 'Не удалось сжать файл' = 'Could not compress the file'
                    diff = str(round((new.image.size - photo.image.size) / float(photo.image.size) * 100, 2)) + ' Не удалось сжать файл'
                oldsize = round(photo.image.size / 1000000, 2)
                newsize = round(new.image.size / 1000000, 2)
                id = new.pk
                imagenew.objects.filter(pk=id).update(size=new.image.size)
            data = {'is_valid': True, 'name': new.image.name, 'url': new.image.url, 'diff': diff,
                    'oldsize': oldsize, 'newsize': newsize,}
        else:
            # 'Данный формат не поддерживается...' = 'This format is not supported.
            # Please upload images in png or jpg (jpeg) format.'
            alert = 'Данный формат не поддерживается. Пожалуйста загрузите картинки форматов png или jpg(jpeg)'
            data = {'is_valid': False, 'name': alert,}
        return JsonResponse(data)
The question: is there any way to make the PNG upload path work faster, and (much more important) make the PNG size reduction closer to what I get with JPEG? Maybe I should use another Python library?
How does tinypng work then? It compresses the same PNG files by 50-60%.
They probably reduce the colour palette from 24-bit to 8-bit. Here's a detailed answer about that - https://stackoverflow.com/a/12146901/1925257
Basic method
You can try that in Pillow like this:
picture_8bit = picture.convert(
    mode='P',  # use mode='PA' for transparency
    palette=Image.ADAPTIVE
)

picture_8bit.save(...)  # do as usual
This should work similarly to what tinypng does.
If you don't want transparency, it's better to first convert RGBA to RGB and then to P mode:
picture_rgb = picture.convert(mode='RGB') # convert RGBA to RGB
picture_8bit = picture_rgb.convert(mode='P', ...)
Getting better results
Calling convert() as shown above will actually call quantize() in the background, and the Median Cut algorithm will be used by default for reducing the colour palette.
In some cases, you'll get better results with other algorithms such as MAXCOVERAGE. To use a different algorithm, you can just call the quantize() method directly:
picture_rgb = picture.convert(mode='RGB')  # convert RGBA to RGB
picture_8bit = picture_rgb.quantize(colors=256, method=Image.MAXCOVERAGE)
You have to understand that downsizing the colour palette means that if the image has lots of colours, you will be losing most of them, because an 8-bit palette can only hold 256 colours.
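As a rough way to see how much you would lose, here is a small sketch (assuming a local test file, hypothetically named test.png) that counts the distinct colours before quantizing:

from PIL import Image

# Hypothetical local file; substitute any PNG you want to inspect.
im = Image.open('test.png').convert('RGB')

# getcolors() returns None if the image has more than maxcolors distinct colours.
colors = im.getcolors(maxcolors=2 ** 24)
print(len(colors) if colors else 'more than 2**24 colours')

# If the count is far above 256, quantizing to an 8-bit palette will merge
# many of those colours together.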
The documentation of Pillow's Image.quantize shows a more convenient way to compress PNG files. In a personal experiment, the following code achieved about 70% compression of the original size, which is also close to the result produced by ImageMagick.
# Image.quantize(colors=256, method=None, kmeans=0, palette=None)
# method: 0 = median cut; 1 = maximum coverage; 2 = fast octree
img = img.quantize(method=2)
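For context, a minimal end-to-end sketch of that approach (the file names are placeholders, not from the original post):

from PIL import Image

# Placeholder file names; substitute your own paths.
img = Image.open('input.png')

# method=2 selects the fast octree quantizer (see the comment above).
img = img.quantize(colors=256, method=2)

# optimize=True lets Pillow squeeze the palette PNG a little further.
img.save('output.png', optimize=True)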

Python web crawler using BeautifulSoup, trouble getting URLs

So I am trying to build a dynamic web crawler to get all URL links within links.
So far I am able to get all the links for the chapters, but when I try to get the section links from each chapter, my output does not print anything.
The code I used:
#########################Chapters#######################
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re

base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
for title in range(1, 4):
    url = base_url.format(title=title)
    r = requests.get(url)
    for link in BeautifulSoup((r.content), "html.parser", parse_only=SoupStrainer('a')):
        if link.has_attr('href'):
            if 'chapt' in link['href']:
                href = "http://law.justia.com" + link['href']
                leveltwo(href)

#########################Sections#######################
def leveltwo(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup((r.content), "html.parser")
    section = soup.find('div', {'class': 'primary-content'})
    for sublinks in section.find_all('a'):
        sectionlinks = sublinks.get('href')
        print(sectionlinks)
With some minor modifications to your code, I was able to get it to run and output the sections. Mainly, you needed to fix your indentation and define the function before you call it.
#########################Chapters#######################
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re


def leveltwo(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup((r.content), "html.parser")
    section = soup.find('div', {'class': 'primary-content'})
    for sublinks in section.find_all('a'):
        sectionlinks = sublinks.get('href')
        print(sectionlinks)


base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
for title in range(1, 4):
    url = base_url.format(title=title)
    r = requests.get(url)
    for link in BeautifulSoup((r.content), "html.parser", parse_only=SoupStrainer('a')):
        try:
            if 'chapt' in link['href']:
                href = "http://law.justia.com" + link['href']
                leveltwo(href)
            else:
                continue
        except KeyError:
            continue
#########################Sections#######################
output:
/codes/alabama/2015/title-3/chapter-1/section-3-1-1/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-2/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-3/index.html
/codes/alabama/2015/title-3/chapter-1/section-3-1-4/index.html etc.
You don't need any try/except blocks. You can use href=True with find or find_all to select only the anchor tags that have an href, or a CSS select a[href] as below. The chapter links are in the first ul inside the article tag with the id #maincontent, so you don't need to filter at all:
base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"
import requests
from bs4 import BeautifulSoup


def leveltwo(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup(r.content, "html.parser")
    section_links = [a["href"] for a in soup.select('div .primary-content a[href]')]
    print(section_links)


for title in range(1, 4):
    url = base_url.format(title=title)
    r = requests.get(url)
    for link in BeautifulSoup(r.content, "html.parser").select("#maincontent ul:nth-of-type(1) a[href]"):
        href = "http://law.justia.com" + link['href']
        leveltwo(href)
If you were to use find_all you simply need to pass find_all(.., href=True) to filter your anchor tags to only select ones that have hrefs.
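For illustration, a minimal sketch of that find_all variant, reusing the same base URL and the chapter filter from the original code:

import requests
from bs4 import BeautifulSoup

base_url = "http://law.justia.com/codes/alabama/2015/title-{title:01d}/"

for title in range(1, 4):
    r = requests.get(base_url.format(title=title))
    soup = BeautifulSoup(r.content, "html.parser")
    # href=True keeps only anchor tags that actually have an href attribute,
    # so no try/except around link['href'] is needed.
    for link in soup.find_all('a', href=True):
        if 'chapt' in link['href']:
            print("http://law.justia.com" + link['href'])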

Can't crawl all pages in a website

I was trying to crawl all the data on all the pages. When I try to join the URL, I can't. I want to know what mistake I am making.
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
import urlparse
from data.items import TextPostItem
from scrapy import optional_features

optional_features.remove('boto')


class RedditCrawler(CrawlSpider):
    name = 'reddit_crawler'
    allowed_domains = ['yellowpages.com']
    start_urls = ['http://www.yellowpages.com/search?search_terms=restaurant&geo_location_terms=California%2C%20KY']
    custom_settings = {
        'BOT_NAME': 'reddit-scraper',
        'DEPTH_LIMIT': 7,
        'DOWNLOAD_DELAY': 3
    }

    def parse(self, response):
        s = Selector(response)
        next_link = s.xpath('//a[@class="next ajax-page"]/@href').extract()[0]
        full_link = urlparse.urljoin('http://www.yellowpages.com', next_link)
        yield self.make_requests_from_url(full_link)
        posts = Selector(response).xpath('//div[@class="search-results organic"]')
        for post in posts:
            item = TextPostItem()
            item['address'] = post.xpath("//p[@class='adr']//text()").extract()
            item['business_name'] = post.xpath("//a[@class='business-name']//text()").extract()
            item['phonenumber'] = post.xpath("//div[@class='phones phone primary']//text()").extract()
            item['categories'] = post.xpath("//div[@class='categories']//text()").extract()
            item['next_link'] = post.xpath("//div[@class='pagination']//a[@class='next ajax-page']//@href").extract()
            yield item
I think your XPath '//div[@class="next ajax-page"]//ul//li[6]//a/@href' is incorrect. It doesn't work for me.
Try something simpler: '//a[@class="next ajax-page"]/@href'
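A minimal sketch of how that simpler XPath would slot into the spider's parse method (keeping the Python 2 urlparse import and the make_requests_from_url call from the question):

def parse(self, response):
    # The simpler XPath targets the "next" anchor directly.
    next_link = response.xpath('//a[@class="next ajax-page"]/@href').extract()
    if next_link:
        full_link = urlparse.urljoin('http://www.yellowpages.com', next_link[0])
        yield self.make_requests_from_url(full_link)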

Why is Scrapy not iterating over all the links on the page even though the XPaths are correct?

This code works fine when I pass extract()[0] or extract(): it gives me output for the first link it parsed. I am not able to understand why it does this, because when I was crawling other websites with this code it worked perfectly fine.
With this website it scrapes only the first link. If I change it to extract()[1] it gives me the second link, and so on. Why is it not iterating automatically in the for loop?
import scrapy
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin


class CompItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()
    name = scrapy.Field()
    date = scrapy.Field()


class criticspider(BaseSpider):
    name = "mmt_mouth"
    allowed_domains = ["mouthshut.com"]
    start_urls = ["http://www.mouthshut.com/websites/makemytripcom-reviews-925031929"]

    # rules = (
    #     Rule(
    #         SgmlLinkExtractor(allow=("search=make-my-trip&page=1/+",)),
    #         callback="parse",
    #         follow=True),
    # )

    def parse(self, response):
        sites = response.xpath('//div[@id="allreviews"]')
        items = []
        for site in sites:
            item = CompItem()
            item['name'] = site.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
            item['title'] = site.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
            item['date'] = site.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()').extract()[0]
            item['link'] = site.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
        yield old_item
Because your for loop has nothing to loop over on the given website. Change your statement
sites = response.xpath('//div[@id="allreviews"]')
to
sites = response.xpath('//div[@id="allreviews"]/ul/li')
Then your for loop can loop over the list elements.
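A minimal sketch of the corrected selection in context, assuming the rest of the spider stays as in the question (the relative .// XPaths inside the loop then resolve against each review li rather than the single wrapper div):

def parse(self, response):
    # One node per review, so the for loop has multiple elements to iterate over.
    sites = response.xpath('//div[@id="allreviews"]/ul/li')
    for site in sites:
        item = CompItem()
        # Relative XPaths (starting with .//) now search within each review node.
        item['name'] = site.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
        item['title'] = site.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
        yield item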