Why can't I append data to a newline in Python 2.7? - python-2.7

This is the code that I have written:
import urllib2
import codecs
import urllib
import re
from bs4 import BeautifulSoup
from lxml.html import fromstring

url = "http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece"
htmltext = urllib.urlopen(url).read()
resp = urllib.urlopen(url)
respData = resp.read()
paras = re.findall(r'<p>(.*?)</p>', str(respData))
soup = BeautifulSoup(htmltext, "lxml")
webpage_title = soup.find_all('h1', attrs={"class": "title"})
webpage_title = webpage_title[0].get_text(strip=True)
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "w+", encoding="utf-8") as f:
    f.write(webpage_title)
soup = BeautifulSoup(htmltext, "lxml")
ut_container = soup.find("div", {"class": "ut-container"})
time = ut_container.find("none").text.strip()
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "a+", encoding="utf-8") as f:
    f.write(time)
The output that is written to the file is:
IIT Bombay: Bird’s eye view and quantum biologyApril 22, 2017 18:56 IST
I want the output to be saved like this:
IIT Bombay: Bird’s eye view and quantum biology
April 22, 2017 18:56 IST

Since the question is very general, I am just giving the idea for this context: you need to write a newline right after writing webpage_title.
f.write(webpage_title)
f.write("\n")

I used the Windows-style "\r\n". It works like a charm:
import urllib2
import codecs
import urllib
import re
from bs4 import BeautifulSoup
from lxml.html import fromstring

url = "http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece"
htmltext = urllib.urlopen(url).read()
resp = urllib.urlopen(url)
respData = resp.read()
paras = re.findall(r'<p>(.*?)</p>', str(respData))
soup = BeautifulSoup(htmltext, "lxml")
webpage_title = soup.find_all('h1', attrs={"class": "title"})
webpage_title = webpage_title[0].get_text(strip=True)
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "w+", encoding="utf-8") as f:
    f.write(webpage_title + "\r\n")
soup = BeautifulSoup(htmltext, "lxml")
ut_container = soup.find("div", {"class": "ut-container"})
time = ut_container.find("none").text.strip()
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "a+", encoding="utf-8") as f:
    f.write(time)
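As a side note, the file only needs to be opened once if both pieces are written together; a minimal sketch of that variant, assuming webpage_title and time have already been extracted as above:
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "w", encoding="utf-8") as f:
    f.write(webpage_title + "\r\n")  # title line, ending with a Windows-style line break
    f.write(time)                    # timestamp lands on its own line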

Related

Pandas writes only the last line in a CSV File

I'm scraping URLs from a txt file and exporting the results to a CSV file, but at the end of the process my code writes only the information from the last URL. My guess is that I'm missing a loop. But where?
Here's my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import urlopen

file = open('urls.txt', 'r')
filelines = (line.strip() for line in file)
for code in filelines:
    site = urlopen(code)
    soup = BeautifulSoup(site, "html.parser")
    final = soup.find_all("span", {"class": "bd js-title-main-info"})
    print final
records = []
for pagetxt in final:
    print pagetxt.text
    records.append(pagetxt.text)
df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')
Thanks
When you read the URLs from the file, you keep only the last page's matches in the variable final. Append the data inside the loop instead (I've marked the changes with #####):
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import urlopen

file = open('urls.txt', 'r')
filelines = (line.strip() for line in file)
records = []  ######
for code in filelines:
    site = urlopen(code)
    soup = BeautifulSoup(site, "html.parser")
    final = soup.find_all("span", {"class": "bd js-title-main-info"})
    print final
    for pagetxt in final:  ######
        print pagetxt.text  ######
        records.append(pagetxt.text)  ######
df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')
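For what it's worth, the same fix can be written a little more compactly with extend(); a minimal sketch under the same assumptions (urls.txt holds one URL per line, selector and column name taken from the question):
records = []
for code in filelines:
    soup = BeautifulSoup(urlopen(code), "html.parser")
    # extend() adds the text of every matching span from this page in one call
    records.extend(span.text for span in
                   soup.find_all("span", {"class": "bd js-title-main-info"}))
df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')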

How do I extract the title of this webpage in Python 2.7 using bs4?

Currently I am using the code below:
import urllib
from bs4 import BeautifulSoup
import codecs

file_obj = open("E:\\Sport_Cricket.txt", "r+")
links_str = file_obj.readlines()
c = 1
for j in links_str:
    url = j.rstrip('\n')
    if url.endswith("ece"):
        htmltext = urllib.urlopen(url).read()
        soup = BeautifulSoup(htmltext, "lxml")
        # title
        webpage_title = soup.find_all("h1", attrs={"class": "title"})
        webpage_title = webpage_title[0].get_text(strip=True)
        with codecs.open("E:\\Corpus\\Sport\\Cricket\\text" + str(c) + ".txt", "w+", encoding="utf-8") as f:
            f.writelines(webpage_title + "\r\n")
        c = c + 1
Sport_Cricket.txt contains:
http://www.thehindu.com/sport/cricket/unadkat-does-the-trick-for-pune/article18401543.ece
http://www.thehindu.com/sport/cricket/live-updates-delhi-daredevils-versus-mumbai-indians/article18400821.ece
http://www.thehindu.com/sport/cricket/old-guard-wants-pull-out-coa-warns-of-consequences/article18400811.ece
http://www.thehindu.com/sport/cricket/the-rise-of-sandeep-sharma/article18400700.ece
http://www.thehindu.com/sport/cricket/axar-has-found-his-mojo/article18400258.ece
I am getting the following error:
Traceback (most recent call last):
  File "C:\Users\PJM\working_extractor_sorted_complete.py", line 31, in <module>
    webpage_title = webpage_title[0].get_text(strip=True)
IndexError: list index out of range
Is there any alternative to webpage_title = webpage_title[0].get_text(strip=True)?
Pankaj,
Use this instead to get the title using BeautifulSoup:
webpage_title = soup.title.string
This will get the first <title> element anywhere in the HTML document.
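If you would rather keep the h1 lookup, a guarded version avoids the IndexError on pages that have no such element; a sketch reusing the names from the question:
h1 = soup.find("h1", attrs={"class": "title"})  # find() returns None instead of raising
if h1 is not None:
    webpage_title = h1.get_text(strip=True)
else:
    webpage_title = soup.title.string           # fall back to the <title> tag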

How do I convert from Unicode to string and write it to a file in Python 2.7?

import urllib
import re
from bs4 import BeautifulSoup

url = "http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece"
htmltext = urllib.urlopen(url).read()
soup = BeautifulSoup(htmltext, "lxml")
webpage_title = soup.find_all('h1', attrs={"class": "title"})
webpage_title = webpage_title[0].get_text(strip=True)
file_obj = open("E:\\Crawler_paras_sorted_test_webpages_title.txt", "w+")
file_obj.write(webpage_title)
file_obj.close
It gives me the following error:
Traceback (most recent call last):
  File "C:\Users\PJM\working_extractor_sorted_title.py", line 37, in <module>
    file_obj.write(webpage_title)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u2019' in position 16: ordinal not in range(128)
How do I fix this? I want it saved as a string in the file.
You need to encode the string, which contains the Unicode code point u'\u2019'; try this:
file_obj=open("E:\\Crawler_paras_sorted_test_webpages_title.txt","w+")
file_obj.write(webpage_title.encode('utf-8'))
file_obj.close() # <-- you were missing the ()
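As an alternative, Python 2.7's io.open (like the codecs.open used elsewhere on this page) takes an encoding argument and then accepts unicode directly, so no explicit .encode() call is needed; a minimal sketch:
import io

with io.open("E:\\Crawler_paras_sorted_test_webpages_title.txt", "w", encoding="utf-8") as f:
    f.write(webpage_title)  # webpage_title is unicode; the file object does the encoding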
Finally figured it out on my own. Here is the code:
import urllib
import re
import codecs
from bs4 import BeautifulSoup

url = "http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece"
htmltext = urllib.urlopen(url).read()
soup = BeautifulSoup(htmltext, "lxml")
webpage_title = soup.find_all('h1', attrs={"class": "title"})
webpage_title = webpage_title[0].get_text(strip=True)
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_title.txt", "w+", encoding="utf-8") as f:
    f.write(webpage_title)
The codecs library really did help.
To solve your problem just use:
f_obj.write(webpage_title.encode('ascii', 'ignore'))
Here is a full interactive session showing the error and the fix:
Python 2.7.12+ (default, Sep 17 2016, 12:08:02)
Type "copyright", "credits" or "license" for more information.
In [7]: import urllib
In [8]: import re
In [9]: from bs4 import BeautifulSoup
In [10]: url = 'http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece'
In [11]: htmltext = urllib.urlopen(url)
In [12]: soup = BeautifulSoup(htmltext, 'lxml')
In [13]: webpage_title = soup.find_all('h1', attrs={'class': 'title'})
In [14]: print webpage_title
[<h1 class="title">\nIIT Bombay: Bird\u2019s eye view and quantum biology\n</h1>]
In [15]: webpage_title = webpage_title[0].get_text(strip=True)
In [16]: print webpage_title
IIT Bombay: Bird’s eye view and quantum biology
In [17]: f_obj = open('Crawler_paras_sorted_test_webpages_title.txt', 'w')
In [18]: f_obj.write(webpage_title)
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-18> in <module>()
----> 1 f_obj.write(webpage_title)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u2019' in position 16: ordinal not in range(128)
In [19]: f_obj.write(webpage_title.encode("utf8"))
In [20]: f_obj.close()
OK, solved this using:
webpage_title.encode("utf8")
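Note that the two encodings suggested above behave differently on the curly apostrophe: 'utf-8' preserves it, while 'ascii' with 'ignore' silently drops it. A quick illustration:
title = u"Bird\u2019s eye view"
print title.encode("utf-8")            # Bird’s eye view  (apostrophe kept)
print title.encode("ascii", "ignore")  # Birds eye view   (apostrophe dropped)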

Python fetch data from website

The Python code below is not fetching the data from the given link. Please help me figure out how to make it work.
import urllib2
from bs4 import BeautifulSoup
quote_page = 'http://www.smartvidya.co.in/2016/11/ugc-net-paper-1-previous-year-questions_14.html'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
name_box = soup.find('div', attrs={'class': 'MsoNormal'})
print name_box
Try this: find() returns only the first matching element, while findAll returns all of them, so loop over the results:
import urllib2
from bs4 import BeautifulSoup

quote_page = 'http://www.smartvidya.co.in/2016/11/ugc-net-paper-1-previous-year-questions_14.html'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
for name_box in soup.findAll('div', attrs={'class': 'MsoNormal'}):
    print name_box.text
Hope this helps!
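If the goal is to save the scraped text rather than just print it, the same loop can write to a file; a sketch reusing the soup from the answer above, with a hypothetical output file questions.txt and the codecs module used in the earlier questions:
import codecs

with codecs.open("questions.txt", "w", encoding="utf-8") as f:
    for name_box in soup.findAll('div', attrs={'class': 'MsoNormal'}):
        f.write(name_box.text + "\n")  # one matched block of text per line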

How can I get all the software link?

I have this code:
import urllib
import urlparse
from bs4 import BeautifulSoup

url = "http://www.downloadcrew.com/?act=search&cat=51"
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
for a in soup.select("div.productListingTitle a[href]"):
    try:
        print (a["href"]).encode("utf-8", "replace")
    except:
        print "no link"
        pass
But when I run it, I get only 20 links. The output should contain more than 20 links.
Because you only download the first page of content.
Just use a loop to download all the pages:
import urllib
import urlparse
from bs4 import BeautifulSoup

for i in xrange(3):
    url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
    pageHtml = urllib.urlopen(url)
    soup = BeautifulSoup(pageHtml)
    for a in soup.select("div.productListingTitle a[href]"):
        try:
            print (a["href"]).encode("utf-8", "replace")
        except:
            print "no link"
If you don't know the number of pages, you can do this instead:
import urllib
import urlparse
from bs4 import BeautifulSoup

i = 0
while 1:
    url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
    pageHtml = urllib.urlopen(url)
    soup = BeautifulSoup(pageHtml)
    has_more = 0
    for a in soup.select("div.productListingTitle a[href]"):
        has_more = 1
        try:
            print (a["href"]).encode("utf-8", "replace")
        except:
            print "no link"
    if has_more:
        i += 1
    else:
        break
I ran it on my computer and it got 60 links from three pages.
Good luck~
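One more note: if any of the hrefs turn out to be relative, urlparse (already imported above) can resolve them against the page URL; a sketch reusing soup and url from the loop above:
from urlparse import urljoin

for a in soup.select("div.productListingTitle a[href]"):
    absolute = urljoin(url, a["href"])  # resolves relative links against the page URL
    print absolute.encode("utf-8", "replace")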