Pandas writes only the last line in a CSV File - python-2.7

I'm scraping URLs from a txt file and exporting the results to a CSV file, but after the whole process my code writes only the information from the last URL. My guess is that I'm missing a loop, but where?
Here's my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import urlopen

file = open('urls.txt', 'r')
filelines = (line.strip() for line in file)

for code in filelines:
    site = urlopen(code)
    soup = BeautifulSoup(site, "html.parser")
    final = soup.find_all("span", {"class": "bd js-title-main-info"})
    print final

records = []
for pagetxt in final:
    print pagetxt.text
    records.append((pagetxt.text))

df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')
Thanks

When you read the data from the file, you keep only the last page's results in the variable final. Append the data inside the loop instead (I've marked the changes with #####):
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import urlopen

file = open('urls.txt', 'r')
filelines = (line.strip() for line in file)

records = []  ######
for code in filelines:
    site = urlopen(code)
    soup = BeautifulSoup(site, "html.parser")
    final = soup.find_all("span", {"class": "bd js-title-main-info"})
    print final
    for pagetxt in final:  ######
        print pagetxt.text  ######
        records.append((pagetxt.text))  ######

df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')
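As an aside (not part of the original answer), requests is imported in both snippets but never used. A minimal sketch of the same loop using requests instead of urlopen, assuming the URLs in urls.txt answer a plain GET:

import requests
from bs4 import BeautifulSoup
import pandas as pd

records = []
with open('urls.txt') as f:                      # the file closes itself
    for code in (line.strip() for line in f):
        resp = requests.get(code)                # fetch each page with requests
        soup = BeautifulSoup(resp.content, "html.parser")
        for pagetxt in soup.find_all("span", {"class": "bd js-title-main-info"}):
            records.append(pagetxt.text)

df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')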

Related

How to Run Multiple URLs in Requests from a File

I'm trying to scrape multiple websites from URLs in a txt file. There's one URL per line.
My code is:
import requests
from bs4 import BeautifulSoup
file = open('url.txt', 'r')
filelines = file.readline()
urllist = requests.get(filelines)
soup = BeautifulSoup(urllist.content, "html.parser")
content = soup.find_all("span", {"class": "title-main-info"})
print content
But it prints only the content of the last URL (last line). What am I doing wrong?
Thanks
Try this. It should work:
import requests
from bs4 import BeautifulSoup

with open('url.txt', 'r') as f:
    for links in f.readlines():
        urllist = requests.get(links.strip())
        soup = BeautifulSoup(urllist.content, "html.parser")
        content = soup.find_all("span", {"class": "title-main-info"})
        print content

Why can't I append data to a new line in Python 2.7?

This is the code that I have written:
import urllib2
import codecs
import urllib
import re
from bs4 import BeautifulSoup
from lxml.html import fromstring

url = "http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece"
htmltext = urllib.urlopen(url).read()
resp = urllib.urlopen(url)
respData = resp.read()
paras = re.findall(r'<p>(.*?)</p>', str(respData))

soup = BeautifulSoup(htmltext, "lxml")
webpage_title = soup.find_all('h1', attrs={"class": "title"})
webpage_title = webpage_title[0].get_text(strip=True)
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "w+", encoding="utf-8") as f:
    f.write(webpage_title)

soup = BeautifulSoup(htmltext, "lxml")
ut_container = soup.find("div", {"class": "ut-container"})
time = ut_container.find("none").text.strip()
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "a+", encoding="utf-8") as f:
    f.write(time)
The output that is written to the file is:
IIT Bombay: Bird’s eye view and quantum biologyApril 22, 2017 18:56 IST
I want the output to be saved like this:
IIT Bombay: Bird’s eye view and quantum biology
April 22, 2017 18:56 IST
Since this is very general, I am just giving an idea for this context.
You just need to write a newline after writing webpage_title:
f.writelines(webpage_title)
f.write("\n")
I used the Windows-style "\r\n" instead. It works like a charm:
import urllib2
import codecs
import urllib
import re
from bs4 import BeautifulSoup
from lxml.html import fromstring

url = "http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece"
htmltext = urllib.urlopen(url).read()
resp = urllib.urlopen(url)
respData = resp.read()
paras = re.findall(r'<p>(.*?)</p>', str(respData))

soup = BeautifulSoup(htmltext, "lxml")
webpage_title = soup.find_all('h1', attrs={"class": "title"})
webpage_title = webpage_title[0].get_text(strip=True)
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "w+", encoding="utf-8") as f:
    f.write(webpage_title + "\r\n")

soup = BeautifulSoup(htmltext, "lxml")
ut_container = soup.find("div", {"class": "ut-container"})
time = ut_container.find("none").text.strip()
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "a+", encoding="utf-8") as f:
    f.write(time)

Download all csv files from URL

I want to download all the CSV files. Any idea how I do this?
from bs4 import BeautifulSoup
import requests

url = requests.get('http://www.football-data.co.uk/englandm.php').text
soup = BeautifulSoup(url)
for link in soup.findAll("a"):
    print link.get("href")
Something like this should work:
from bs4 import BeautifulSoup
from time import sleep
import requests

if __name__ == '__main__':
    url = requests.get('http://www.football-data.co.uk/englandm.php').text
    soup = BeautifulSoup(url)
    for link in soup.findAll("a"):
        current_link = link.get("href")
        if current_link.endswith('csv'):
            print('Found CSV: ' + current_link)
            print('Downloading %s' % current_link)
            sleep(10)
            response = requests.get('http://www.football-data.co.uk/%s' % current_link, stream=True)
            fn = current_link.split('/')[0] + '_' + current_link.split('/')[1] + '_' + current_link.split('/')[2]
            with open(fn, "wb") as handle:
                for data in response.iter_content():
                    handle.write(data)
You just need to filter the hrefs, which you can do with a CSS selector: a[href$=.csv] will find the hrefs ending in .csv. Then join each to the base URL, request it, and finally write the content:
from bs4 import BeautifulSoup
import requests
from urlparse import urljoin
from os.path import basename

base = "http://www.football-data.co.uk/"
url = requests.get('http://www.football-data.co.uk/englandm.php').text
soup = BeautifulSoup(url)

for link in (urljoin(base, a["href"]) for a in soup.select("a[href$=.csv]")):
    with open(basename(link), "w") as f:
        f.writelines(requests.get(link))
That will give you five files, E0.csv, E1.csv, E2.csv, E3.csv and E4.csv, with all the data inside.
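A small design note, not from the original answer: opening the file in text mode and passing the whole Response to writelines happens to work in Python 2, but writing the raw bytes explicitly is more robust and also runs unchanged on Python 3 (with urllib.parse instead of urlparse). A minimal sketch of that variant, under the same assumptions about the page:

from bs4 import BeautifulSoup
import requests
from urlparse import urljoin        # use urllib.parse on Python 3
from os.path import basename

base = "http://www.football-data.co.uk/"
soup = BeautifulSoup(requests.get(base + "englandm.php").text, "html.parser")

for link in (urljoin(base, a["href"]) for a in soup.select('a[href$=".csv"]')):
    with open(basename(link), "wb") as f:        # binary mode: write the bytes as-is
        f.write(requests.get(link).content)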

Where to put scraper script in django so as to access django database

I have a script which populates a database with initial data (football matches) from a web scraper. Currently I have put it under __init__.py, but that looks and sounds wrong. Here is my code in __init__.py. Where should I put this code so that it updates the matches field in my database? Kindly help.
from .models import Betting
import re
import requests
import urllib
from bs4 import BeautifulSoup

urls = "https://sms.betyetu.co.ke/tomorrow.html"
htmlfile = urllib.urlopen(urls)
htmltext = htmlfile.read()
soup = BeautifulSoup(htmltext, "html.parser")

for i in soup.findAll("table"):
    try:
        momo = 1
        a = len(i.contents[3].contents)
        while momo <= a:
            foo = Betting(matches=i.contents[3].contents[momo].findAll("td")[2].text)
            momo += 2
            foo.save()
            print i.contents[3].contents[3].findAll("td")[0].text
    except:
        momo = 1
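No answer is recorded here, but a common place for a one-off scraper like this is a custom management command rather than __init__.py, so it only runs when you call it and Django is fully set up. A minimal sketch, assuming the app is called betting (the app and file names are illustrative):

# betting/management/commands/scrape_matches.py  (illustrative path)
import urllib
from bs4 import BeautifulSoup
from django.core.management.base import BaseCommand

from betting.models import Betting   # illustrative app name

class Command(BaseCommand):
    help = "Populate the Betting table from the betyetu fixtures page"

    def handle(self, *args, **options):
        htmltext = urllib.urlopen("https://sms.betyetu.co.ke/tomorrow.html").read()
        soup = BeautifulSoup(htmltext, "html.parser")
        for table in soup.findAll("table"):
            rows = table.contents[3].contents          # same traversal as the question
            for momo in range(1, len(rows), 2):
                match = rows[momo].findAll("td")[2].text
                Betting.objects.create(matches=match)  # saves one row per match

Run it with python manage.py scrape_matches; the management/ and commands/ directories each need an empty __init__.py so Django can discover the command.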

Pandas Bad Lines Warning Capture

Is there any way in Pandas to capture the warning produced by setting error_bad_lines = False and warn_bad_lines = True? For instance the following script:
import pandas as pd
from StringIO import StringIO
data = StringIO("""a,b,c
1,2,3
4,5,6
6,7,8,9
1,2,5
3,4,5""")
pd.read_csv(data, warn_bad_lines=True, error_bad_lines=False)
produces the warning:
Skipping line 4: expected 3 fields, saw 4
I'd like to store this output to a string so that I can eventually write it to a log file to keep track of records that are being skipped.
I tried using the warnings module, but it doesn't appear as though this "warning" is a warning in the traditional sense. I'm using Python 2.7 and Pandas 0.16.
I don't think this is implemented in pandas (see source1, source2).
My solutions:
1. Pre- or post-processing
import pandas as pd
import csv

df = pd.read_csv('data.csv', warn_bad_lines=True, error_bad_lines=False)

# compare length of rows with the recommended value:
RECOMMENDED = 3
with open('data.csv') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    for row in reader:
        if len(row) != RECOMMENDED:
            print ("Length of row is: %r" % len(row))
            print row

# compare length of rows with the number of columns in df
lencols = len(df.columns)
print lencols
with open('data.csv') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    for row in reader:
        if len(row) != lencols:
            print ("Length of row is: %r" % len(row))
            print row
2. Replace sys.stdout and sys.stderr
import pandas as pd
import os
import sys

class RedirectStdStreams(object):
    def __init__(self, stdout=None, stderr=None):
        self._stdout = stdout or sys.stdout
        self._stderr = stderr or sys.stderr

    def __enter__(self):
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        self.old_stdout.flush(); self.old_stderr.flush()
        sys.stdout, sys.stderr = self._stdout, self._stderr

    def __exit__(self, exc_type, exc_value, traceback):
        self._stdout.flush(); self._stderr.flush()
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr

if __name__ == '__main__':
    devnull = open('log.txt', 'w')
    # replaces sys.stdout, sys.stderr, see http://stackoverflow.com/a/6796752/2901002
    with RedirectStdStreams(stdout=devnull, stderr=devnull):
        df = pd.read_csv('data.csv', warn_bad_lines=True, error_bad_lines=False)
I can't help you with anything older than Python 3, but I've had very good success with the following:
import pandas as pd
from contextlib import redirect_stderr
import io

# Redirect stderr to something we can report on.
f = io.StringIO()
with redirect_stderr(f):
    df = pd.read_csv(
        new_file_name, header=None, error_bad_lines=False, warn_bad_lines=True, dtype=header_types
    )
if f.getvalue():
    logger.warning("Had parsing errors: {}".format(f.getvalue()))
I searched for this issue a number of times and kept being pointed to this question. Hope it helps someone else later on.
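For anyone reading this later: in newer pandas the on_bad_lines argument replaced error_bad_lines/warn_bad_lines, and from roughly version 1.4 it accepts a callable when engine="python", which lets you capture the offending rows directly instead of scraping stderr. A minimal sketch under that assumption:

import io
import pandas as pd

bad_rows = []

def keep_bad_line(fields):
    # called once per malformed row; returning None drops it from the DataFrame
    bad_rows.append(fields)
    return None

data = io.StringIO("a,b,c\n1,2,3\n4,5,6\n6,7,8,9\n1,2,5\n3,4,5")
df = pd.read_csv(data, on_bad_lines=keep_bad_line, engine="python")
print(bad_rows)   # [['6', '7', '8', '9']]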