How can I get all the software links? - python-2.7

I have this code:
import urllib
import urlparse
from bs4 import BeautifulSoup
url = "http://www.downloadcrew.com/?act=search&cat=51"
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
for a in soup.select("div.productListingTitle a[href]"):
    try:
        print (a["href"]).encode("utf-8","replace")
    except:
        print "no link"
        pass
But when I run it, I only get 20 links. The output should be more than 20 links.

Because you only download the first page of content.
Just use a loop to download all pages:
import urllib
import urlparse
from bs4 import BeautifulSoup
for i in xrange(3):
    url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
    pageHtml = urllib.urlopen(url)
    soup = BeautifulSoup(pageHtml)
    for a in soup.select("div.productListingTitle a[href]"):
        try:
            print (a["href"]).encode("utf-8","replace")
        except:
            print "no link"
If you don't know the number of pages, you can do this:
import urllib
import urlparse
from bs4 import BeautifulSoup
i = 0
while 1:
    url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
    pageHtml = urllib.urlopen(url)
    soup = BeautifulSoup(pageHtml)
    has_more = 0
    for a in soup.select("div.productListingTitle a[href]"):
        has_more = 1
        try:
            print (a["href"]).encode("utf-8","replace")
        except:
            print "no link"
    if has_more:
        i += 1
    else:
        break
I ran it on my computer and it got 60 links from three pages.
Good luck~
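One caveat with the open-ended version: some sites keep serving the last page for out-of-range page numbers, in which case has_more never goes false and the loop never stops. A defensive variant (just a sketch, reusing the same URL and selector) also breaks when a page repeats the first link of the previous one:
import urllib
from bs4 import BeautifulSoup
i = 0
last_first_link = None
while 1:
    url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
    soup = BeautifulSoup(urllib.urlopen(url))
    links = [a["href"] for a in soup.select("div.productListingTitle a[href]")]
    # stop on an empty page, or when the site serves the same page again
    if not links or links[0] == last_first_link:
        break
    last_first_link = links[0]
    for href in links:
        print href.encode("utf-8", "replace")
    i += 1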

Related

Python fetch data from website

The Python code below doesn't fetch the data from the given link. Please help me figure out how to make it work.
import urllib2
from bs4 import BeautifulSoup
quote_page = 'http://www.smartvidya.co.in/2016/11/ugc-net-paper-1-previous-year-questions_14.html'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
name_box = soup.find('div', attrs={'class': 'MsoNormal'})
print name_box
Try this:
import urllib2
from bs4 import BeautifulSoup
quote_page = 'http://www.smartvidya.co.in/2016/11/ugc-net-paper-1-previous-year-questions_14.html'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
for name_box in soup.findAll('div', attrs={'class': 'MsoNormal'}):
    print name_box.text
Hope this helps!

Download all csv files from URL

I want to download all the csv files. Any idea how I can do this?
from bs4 import BeautifulSoup
import requests
url = requests.get('http://www.football-data.co.uk/englandm.php').text
soup = BeautifulSoup(url)
for link in soup.findAll("a"):
    print link.get("href")
Something like this should work:
from bs4 import BeautifulSoup
from time import sleep
import requests
if __name__ == '__main__':
    url = requests.get('http://www.football-data.co.uk/englandm.php').text
    soup = BeautifulSoup(url)
    for link in soup.findAll("a"):
        current_link = link.get("href")
        if current_link.endswith('csv'):
            print('Found CSV: ' + current_link)
            print('Downloading %s' % current_link)
            sleep(10)
            response = requests.get('http://www.football-data.co.uk/%s' % current_link, stream=True)
            fn = current_link.split('/')[0] + '_' + current_link.split('/')[1] + '_' + current_link.split('/')[2]
            with open(fn, "wb") as handle:
                for data in response.iter_content():
                    handle.write(data)
You just need to filter the hrefs, which you can do with the CSS selector a[href$=.csv]: it finds the hrefs ending in .csv. Then join each one to the base URL, request it, and finally write the content:
from bs4 import BeautifulSoup
import requests
from urlparse import urljoin
from os.path import basename
base = "http://www.football-data.co.uk/"
url = requests.get('http://www.football-data.co.uk/englandm.php').text
soup = BeautifulSoup(url)
for link in (urljoin(base, a["href"]) for a in soup.select("a[href$=.csv]")):
    with open(basename(link), "wb") as f:
        f.write(requests.get(link).content)
That will give you five files, E0.csv, E1.csv, E2.csv, E3.csv, E4.csv, with all the data inside.

Where to put scraper script in django so as to access django database

I have a script which populates a database with initial data (football matches) from a web scraper. Currently I have put it under __init__.py, but that looks and sounds wrong. Here is my code in __init__.py. Where should I put this code so that it updates the matches field in my database? Kindly help.
from .models import Betting
import re
import requests
import urllib
from bs4 import BeautifulSoup
urls= "https://sms.betyetu.co.ke/tomorrow.html"
htmlfile = urllib.urlopen(urls)
htmltext = htmlfile.read()
soup = BeautifulSoup(htmltext, "html.parser")
for i in soup.findAll("table"):
try:
momo = 1
a = len( i.contents[3].contents)
while momo <= a:
foo = Betting(matches=i.contents[3].contents[momo].findAll("td")[2].text)
momo += 2
**strong text** foo.save()
print i.contents[3].contents[3].findAll("td")[0].text
except:
momo = 1
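For what it's worth, the usual Django home for a loader like this is a custom management command: it has full access to your models but only runs when you invoke it, not on every import. A minimal sketch, assuming the app is called betting (a hypothetical name) and simplifying the row extraction from the loop above:
# betting/management/commands/scrape_matches.py  (hypothetical path)
import urllib
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand
from betting.models import Betting  # adjust to your actual app

class Command(BaseCommand):
    help = "Scrape tomorrow's matches and store them in the Betting table"

    def handle(self, *args, **options):
        html = urllib.urlopen("https://sms.betyetu.co.ke/tomorrow.html").read()
        soup = BeautifulSoup(html, "html.parser")
        for table in soup.findAll("table"):
            for row in table.findAll("tr"):
                cells = row.findAll("td")
                if len(cells) > 2:  # same column the original loop reads
                    Betting(matches=cells[2].text).save()
You would then run it with python manage.py scrape_matches (both the management and commands directories need an empty __init__.py).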

How to crawl the latest articles on a specific site using a specific keyword?

I am trying some Python code for crawling article links on specific sites, based on a keyword such as the name of the article, but I don't get the appropriate links.
import sys
import requests
from bs4 import BeautifulSoup
import urllib.request
from urlparse import urlparse

def extract_article_links(url,data):
    req = urllib.request.Request(url,data)
    response = urllib.request.urlopen(req)
    responseData = response.read()
    #r = requests.get(url)
    soup = BeautifulSoup(responseData.content)
    links = soup.find_all('a')
    for link in links:
        try:
            #if 'http' in link:
            print ("<a href='%s'>%s</a>" % (link.get('href'),link.text))
        except Exception as e :
            print (e)
    responseData = soup.find_all("div",{"class:info"})
    print responseData
    for item in responseData:
        print (item.contents[0].text)
        print (item.contents[1].text)

if __name__ == "__main__":
    from sys import argv
    if (len(argv)<2):
        print "Insufficient arguments..!!"
        sys.exit(1)
    url = sys.argv[1]
    values = {'s':'article','submit':'search'}
    data = urlparse.urlencode(values)
    data = data.encode('utf-8')
    extract_article_links(url,data)
Try lxml: analyze the HTML, locate the elements you are looking for, and then you can do this easily with XPath:
from lxml import html
# `source` holds the raw HTML of the page you fetched
print html.fromstring(source).xpath('//a/@href')
Of course, you need to modify the XPath according to the attribute you are looking for.
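For example, to keep only the links whose class contains a given value (the class name story-link below is purely hypothetical, adjust it to the real markup), the XPath could look like this:
import requests
from lxml import html
source = requests.get('http://www.hindustantimes.com/').content  # URL taken from the answer below
for link in html.fromstring(source).xpath('//a[contains(@class, "story-link")]/@href'):
    print link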
Try this:
import requests
from bs4 import BeautifulSoup
def extract_article_links(url,data):
    soup = BeautifulSoup(requests.get('http://www.hindustantimes.com/Search/search.aspx?q={}&op=All&pt=all&auth=all'.format(data)).content)
    responseData = soup.find("ul",{'class':'searchNews'})
    _a, _li = responseData.find_all('a'), responseData.find_all('li')
    for i,j in zip(_a,_li):
        print '='*40,'\nLink: ',i['href'], '\nTitle: ',i.contents[0], '\nContent: \n\t', j.p.get_text(),'\n'

if __name__ == "__main__":
    url = "http://www.hindustantimes.com/"
    extract_article_links(url,'article')

How to get all application links on a page?

I have this code:
from bs4 import BeautifulSoup
import urllib
url = 'http://www.brothersoft.com/windows/mp3_audio/midi_tools/'
html = urllib.urlopen(url)
soup = BeautifulSoup(html)
for a in soup.select('div.freeText dl a[href]'):
    print "http://www.borthersoft.com"+a['href'].encode('utf-8','replace')
What I get is:
http://www.borthersoft.com/synthfont-159403.html
http://www.borthersoft.com/midi-maker-23747.html
http://www.borthersoft.com/keyboard-music-22890.html
http://www.borthersoft.com/mp3-editor-for-free-227857.html
http://www.borthersoft.com/midipiano---midi-file-player-recorder-61384.html
http://www.borthersoft.com/notation-composer-32499.html
http://www.borthersoft.com/general-midi-keyboard-165831.html
http://www.borthersoft.com/digital-music-mentor-31262.html
http://www.borthersoft.com/unisyn-250033.html
http://www.borthersoft.com/midi-maestro-13002.html
http://www.borthersoft.com/music-editor-free-139151.html
http://www.borthersoft.com/midi-converter-studio-46419.html
http://www.borthersoft.com/virtual-piano-65133.html
http://www.borthersoft.com/yamaha-9000-drumkit-282701.html
http://www.borthersoft.com/virtual-midi-keyboard-260919.html
http://www.borthersoft.com/anvil-studio-6269.html
http://www.borthersoft.com/midicutter-258103.html
http://www.borthersoft.com/softick-audio-gateway-55913.html
http://www.borthersoft.com/ipmidi-161641.html
http://www.borthersoft.com/d.accord-keyboard-chord-dictionary-28598.html
There should be 526 application links printed out, but I only get twenty.
What is missing from the code?
There are only 20 application links on each page.
You have to iterate over all the pages to get all the links:
from bs4 import BeautifulSoup
import urllib
for page in range(1, 27+1): # currently there are 27 pages.
    url = 'http://www.brothersoft.com/windows/mp3_audio/midi_tools/{}.html'.format(page)
    html = urllib.urlopen(url)
    soup = BeautifulSoup(html)
    for a in soup.select('div.freeText dl a[href]'):
        print "http://www.borthersoft.com"+a['href'].encode('utf-8','replace')