Download all csv files from URL - python-2.7

I want to download all csv files, any idea how I do this?
from bs4 import BeautifulSoup
import requests

# Fetch the league index page and print the target of every anchor on it.
page_html = requests.get('http://www.football-data.co.uk/englandm.php').text
document = BeautifulSoup(page_html)
for anchor in document.findAll("a"):
    print(anchor.get("href"))

Something like this should work:
from bs4 import BeautifulSoup
from time import sleep
import requests

BASE_URL = 'http://www.football-data.co.uk/'
# Pause between downloads so the host is not flooded with requests.
DELAY_SECONDS = 10
# iter_content() defaults to 1-byte chunks; use a sensible block size instead.
CHUNK_SIZE = 8192

if __name__ == '__main__':
    # Download every CSV linked from the index page into the current directory.
    page_html = requests.get(BASE_URL + 'englandm.php').text
    soup = BeautifulSoup(page_html, 'html.parser')
    for link in soup.findAll("a"):
        current_link = link.get("href")
        # Anchors without an href give None, which has no endswith().
        if not current_link or not current_link.endswith('csv'):
            continue
        print('Found CSV: ' + current_link)
        print('Downloading %s' % current_link)
        sleep(DELAY_SECONDS)
        response = requests.get(BASE_URL + current_link, stream=True)
        if response.status_code != 200:
            # Do not save an error page under a .csv name.
            print('Skipping %s (HTTP %d)' % (current_link, response.status_code))
            continue
        # Flatten the relative path "dir/subdir/name.csv" into
        # "dir_subdir_name.csv"; works for any number of path components.
        fn = '_'.join(part for part in current_link.split('/') if part)
        with open(fn, "wb") as handle:
            for data in response.iter_content(chunk_size=CHUNK_SIZE):
                handle.write(data)

You just need to filter the hrefs, which you can do with a CSS selector: a[href$=.csv] finds the anchors whose href ends in .csv. Then join each href to the base URL, request it, and finally write the content:
from bs4 import BeautifulSoup
import requests
from urlparse import urljoin
from os.path import basename

base = "http://www.football-data.co.uk/"
page_html = requests.get(urljoin(base, 'englandm.php')).text
soup = BeautifulSoup(page_html, 'html.parser')
# The attribute value is quoted because ".csv" is not a valid bare CSS
# identifier; strict selector engines reject the unquoted form.
for a in soup.select('a[href$=".csv"]'):
    link = urljoin(base, a["href"])
    # Write the raw response bytes in binary mode so the payload is stored
    # unchanged on every platform (text mode can rewrite line endings).
    with open(basename(link), "wb") as f:
        f.write(requests.get(link).content)
Which will give you five files, E0.csv, E1.csv, E2.csv, E3.csv, E4.csv with all the data inside.

Related

Pandas writes only the last line in a CSV File

I'm scraping URLs from a txt file and exporting the results to a CSV file. But after the whole process runs, my code writes only the information from the last URL. My guess is that I'm forgetting a loop. But where?
Here's my code:
# Question's code as posted. Block indentation was lost in this listing, so
# the nesting of the statements below cannot be read off the text itself.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import urlopen
file = open('urls.txt', 'r')
# Generator yielding each line of urls.txt with surrounding whitespace stripped.
filelines = (line.strip() for line in file)
for code in filelines:
site = urlopen(code)
soup = BeautifulSoup(site, "html.parser")
# Every <span> matched by the class string "bd js-title-main-info".
final = soup.find_all("span", {"class": "bd js-title-main-info"})
print final
# NOTE(review): `records` is created only after `final` has been assigned. If
# this part runs once after the URL loop has finished, only the last page's
# spans are collected -- confirm against the intended nesting.
records = []
for pagetxt in final:
print pagetxt.text
records.append((pagetxt.text))
# One-column table of the collected texts, written without the index column.
df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')
Thanks
When you get data from file you keep only last value in variable final. Try to append data earlier (I've marked changes with #####):
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import urlopen

# Collect the product names from every URL before building the table.
records = [] ######
# `with` closes urls.txt even when a request fails part-way through, and the
# handle no longer shadows the builtin name `file`.
with open('urls.txt', 'r') as url_file:
    for code in (line.strip() for line in url_file):
        if not code:
            continue  # a blank line is not a URL
        site = urlopen(code)
        soup = BeautifulSoup(site, "html.parser")
        final = soup.find_all("span", {"class": "bd js-title-main-info"})
        print(final)
        # Append inside the URL loop so no page's results are overwritten.
        for pagetxt in final: ######
            print(pagetxt.text) ######
            records.append(pagetxt.text) ######
df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')

How to Run Multiples URLs in Requests from a File

I'm trying to scrape multiple websites from URLs in a txt file. There's one URL per line.
my code is:
# Question's code as posted (block indentation was lost in this listing).
# NOTE(review): `Import` is capitalised below; the Python keyword is the
# lowercase `import`.
Import requests
from bs4 import BeautifulSoup
file = open('url.txt', 'r')
# readline() returns a single line (trailing newline included), so only one
# URL from the file is ever requested.
filelines = file.readline()
urllist = requests.get(filelines)
soup = BeautifulSoup(urllist.content, "html.parser")
# Every <span> matched by the class "title-main-info".
content = soup.find_all("span", {"class": "title-main-info"})
print content
But it prints only the last URL's content (last line). What am I doing wrong?
Thanks
Try this. It should work:
import requests
from bs4 import BeautifulSoup

# Fetch every URL listed in url.txt (one per line) and print the matching
# <span> elements of each page.
with open('url.txt', 'r') as f:
    # Iterate the file lazily instead of loading all lines with readlines().
    for line in f:
        link = line.strip()
        if not link:
            continue  # a blank line would be requested as an empty URL
        urllist = requests.get(link)
        soup = BeautifulSoup(urllist.content, "html.parser")
        content = soup.find_all("span", {"class": "title-main-info"})
        print(content)

Where to put scraper script in django so as to access django database

I have a script which populates a database with initial data (football matches) from a web scraper. Currently I have put it under __init__.py, but that looks and sounds wrong. Here is my code in __init__.py. Where should I put this code so that it updates the matches field in my database? Kindly help.
from .models import Betting
import re
import requests
import urllib
from bs4 import BeautifulSoup

# Scrape tomorrow's fixtures and store one Betting row per match.
urls = "https://sms.betyetu.co.ke/tomorrow.html"
htmlfile = urllib.urlopen(urls)
htmltext = htmlfile.read()
soup = BeautifulSoup(htmltext, "html.parser")
for i in soup.findAll("table"):
    try:
        rows = i.contents[3].contents
        # Only odd positions are visited; presumably the even ones hold
        # whitespace text nodes between rows -- verify against the page.
        momo = 1
        # Strictly less-than: index len(rows) does not exist, so the loop now
        # ends normally instead of relying on an IndexError to stop it.
        while momo < len(rows):
            foo = Betting(matches=rows[momo].findAll("td")[2].text)
            momo += 2
            foo.save()
            print(rows[3].findAll("td")[0].text)
    except (AttributeError, IndexError):
        # This table does not have the expected layout; go to the next one.
        # Database errors from save() are no longer silently swallowed.
        continue

How to crawl latest articles in a specific site using specific set keyword?

I am trying to write Python code that crawls article links on specific sites based on a keyword, such as the name of the article, but I don't get the appropriate links.
import sys
import requests
from bs4 import BeautifulSoup
import urllib.request
from urlparse import urlparse
# Question's version, as posted (block indentation was lost in this listing).
# Requests `url` with `data` as the request payload, prints every anchor of
# the returned page as an HTML link, then prints the first two children of
# each element returned by the "div" lookup.
# NOTE(review): the listing mixes Python 3 style (`urllib.request`) with
# Python 2 style (`print responseData` statement, `urlparse` module import).
def extract_article_links(url,data):
req = urllib.request.Request(url,data)
response = urllib.request.urlopen(req)
responseData = response.read()
#r = requests.get(url)
# NOTE(review): `responseData` is what response.read() returned; `.content`
# is an attribute of a requests response (see the commented-out line above)
# -- confirm this lookup can succeed.
soup = BeautifulSoup(responseData.content)
links = soup.find_all('a')
for link in links:
try:
#if 'http' in link:
print ("<a href='%s'>%s</a>" % (link.get('href'),link.text))
except Exception as e :
print (e)
# NOTE(review): {"class:info"} is a one-element set literal, not the dict
# {"class": "info"} -- likely a typo.
responseData = soup.find_all("div",{"class:info"})
print responseData
for item in responseData:
print (item.contents[0].text)
print (item.contents[1].text)
# Script entry point: expects the target URL as the first command-line
# argument and exits with status 1 when it is missing.
if __name__ == "__main__":
from sys import argv
if (len(argv)<2):
print"Insufficient arguments..!!"
sys.exit(1)
url = sys.argv[1]
# Form fields passed to extract_article_links as the request payload.
values = {'s':'article','submit':'search'}
# NOTE(review): `urlparse` is bound by `from urlparse import urlparse` above,
# i.e. to the function; urlencode lives in `urllib` (Python 2) or
# `urllib.parse` (Python 3) -- confirm this call resolves.
data = urlparse.urlencode(values)
data = data.encode('utf-8')
extract_article_links(url,data)
Try lxml: analyze the HTML and locate the elements you are looking for, then you can do this easily with XPath:
from lxml import html

# `source` is the HTML text of the page, obtained beforehand.
# "@href" selects the href attribute of each <a>; "#href" is not valid XPath.
hrefs = html.fromstring(source).xpath('//a/@href')
print(hrefs)
of course you need to modify the xpath according to the attribute you are looking for.
try this
import requests
from bs4 import BeautifulSoup

SEARCH_URL = 'http://www.hindustantimes.com/Search/search.aspx'


def extract_article_links(url, data):
    """Print link, title and summary of every search hit for *data*.

    url:  accepted for interface compatibility; the search endpoint is fixed
          (SEARCH_URL), so this value is not used.
    data: search phrase; it is URL-encoded before being sent.

    Returns None. Prints nothing when the page has no result list.
    """
    # A list of pairs keeps the parameter order of the original URL, and
    # lets requests do the escaping (spaces, '&', non-ASCII, ...).
    query = [('q', data), ('op', 'All'), ('pt', 'all'), ('auth', 'all')]
    response = requests.get(SEARCH_URL, params=query)
    soup = BeautifulSoup(response.content, 'html.parser')
    results = soup.find("ul", {'class': 'searchNews'})
    if results is None:
        return
    # Read the anchor from inside each <li> so a link can never be paired
    # with the summary of a different item.
    for item in results.find_all('li'):
        anchor = item.find('a')
        summary = item.p
        if anchor is None or summary is None:
            continue
        if not anchor.get('href') or not anchor.contents:
            continue
        # Single-string print; the spacing mirrors the layout produced by
        # the original comma-separated Python 2 print statement.
        print('%s \nLink:  %s \nTitle:  %s \nContent: \n\t%s \n' % (
            '=' * 40, anchor['href'], anchor.contents[0], summary.get_text()))


if __name__ == "__main__":
    url = "http://www.hindustantimes.com/"
    extract_article_links(url, 'article')

How can I get all the software link?

I have this code:
import urllib
import urlparse
from bs4 import BeautifulSoup

# Print the href of every product-title link on the first results page.
url = "http://www.downloadcrew.com/?act=search&cat=51"
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
title_links = soup.select("div.productListingTitle a[href]")
for a in title_links:
    try:
        print(a["href"].encode("utf-8", "replace"))
    except:
        print("no link")
But when I run it, I only get 20 links only. The output should be more than 20 links.
Because you only download the first page of content.
Just use a loop to download all pages:
import urllib
import urlparse
from bs4 import BeautifulSoup

# Number of result pages to walk through.
PAGE_COUNT = 3

# Print the product-title links of each results page in turn.
for i in xrange(PAGE_COUNT):
    url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
    pageHtml = urllib.urlopen(url)
    soup = BeautifulSoup(pageHtml, "html.parser")
    for a in soup.select("div.productListingTitle a[href]"):
        try:
            print(a["href"].encode("utf-8", "replace"))
        except (KeyError, UnicodeError):
            # Catch only link-related failures; a bare except would also
            # swallow Ctrl-C and real programming errors.
            print("no link")
If you don't know the number of pages, you can keep requesting pages until one returns no links:
import urllib
import urlparse
from bs4 import BeautifulSoup

# Walk the result pages in order and stop at the first page without links.
i = 0
while True:
    url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
    pageHtml = urllib.urlopen(url)
    soup = BeautifulSoup(pageHtml, "html.parser")
    links = soup.select("div.productListingTitle a[href]")
    if not links:
        break  # an empty page means there is nothing more to fetch
    for a in links:
        try:
            print(a["href"].encode("utf-8", "replace"))
        except (KeyError, UnicodeError):
            # Catch only link-related failures; a bare except would also
            # swallow Ctrl-C and real programming errors.
            print("no link")
    i += 1
I ran it on my computer and it got 60 links from three pages.
Good luck~