This is the code that I have written:
import urllib2
import codecs
import urllib
import re
from bs4 import BeautifulSoup
from lxml.html import fromstring

url = "http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece"
htmltext = urllib.urlopen(url).read()
resp = urllib.urlopen(url)
respData = resp.read()
paras = re.findall(r'<p>(.*?)</p>', str(respData))
soup = BeautifulSoup(htmltext, "lxml")

# Article title from the <h1 class="title"> tag.
webpage_title = soup.find_all('h1', attrs={"class": "title"})
webpage_title = webpage_title[0].get_text(strip=True)
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "w+", encoding="utf-8") as f:
    f.write(webpage_title)

# Publication time: the timestamp sits in a literal <none> tag inside the ut-container div.
soup = BeautifulSoup(htmltext, "lxml")
ut_container = soup.find("div", {"class": "ut-container"})
time = ut_container.find("none").text.strip()
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "a+", encoding="utf-8") as f:
    f.write(time)
The output that is written to the file is:
IIT Bombay: Bird’s eye view and quantum biologyApril 22, 2017 18:56 IST
I want the output to be saved like this :
IIT Bombay: Bird’s eye view and quantum biology
April 22, 2017 18:56 IST
Since the question is very general, I am just giving the idea for this context.
You just need to write a newline after writing webpage_title. Note that despite its name, writelines does not add line separators for you, which is why the explicit f.write("\n") is needed:
f.writelines(webpage_title)
f.write("\n")
I used the Windows-style "\r\n". It works like a charm:
import urllib2
import codecs
import urllib
import re
from bs4 import BeautifulSoup
from lxml.html import fromstring

url = "http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece"
htmltext = urllib.urlopen(url).read()
resp = urllib.urlopen(url)
respData = resp.read()
paras = re.findall(r'<p>(.*?)</p>', str(respData))
soup = BeautifulSoup(htmltext, "lxml")

# Title, with "\r\n" appended so the timestamp lands on its own line.
webpage_title = soup.find_all('h1', attrs={"class": "title"})
webpage_title = webpage_title[0].get_text(strip=True)
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "w+", encoding="utf-8") as f:
    f.write(webpage_title + "\r\n")

# Publication time, appended after the title.
soup = BeautifulSoup(htmltext, "lxml")
ut_container = soup.find("div", {"class": "ut-container"})
time = ut_container.find("none").text.strip()
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "a+", encoding="utf-8") as f:
    f.write(time)
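A note on why the explicit "\r\n" matters here: codecs.open opens the file in binary mode, so Python's usual newline translation never happens and a bare "\n" is written literally, which Windows Notepad renders as a single long line. If you would rather not hard-code the line ending, os.linesep gives the platform's native one; a minimal sketch of the same two writes:

import os
import codecs

# codecs.open does no newline translation, so we supply the separator ourselves;
# os.linesep is "\r\n" on Windows and "\n" on Unix-like systems.
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_complete.txt", "w+", encoding="utf-8") as f:
    f.write(webpage_title + os.linesep)
    f.write(time)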
I'm scraping URLs from a txt file and exporting the results to a CSV file. But after the whole process, my code writes only the information from the last URL. My guess is that I'm forgetting a loop. But where?
Here's my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import urlopen

file = open('urls.txt', 'r')
filelines = (line.strip() for line in file)
for code in filelines:
    site = urlopen(code)
    soup = BeautifulSoup(site, "html.parser")
    final = soup.find_all("span", {"class": "bd js-title-main-info"})
    print final

records = []
for pagetxt in final:
    print pagetxt.text
    records.append((pagetxt.text))

df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')
Thanks
When you get data from the file, you keep only the last value in the variable final. Try to append the data earlier, inside the loop (I've marked the changes with #####):
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import urlopen

file = open('urls.txt', 'r')
filelines = (line.strip() for line in file)
records = []                               ######
for code in filelines:
    site = urlopen(code)
    soup = BeautifulSoup(site, "html.parser")
    final = soup.find_all("span", {"class": "bd js-title-main-info"})
    print final
    for pagetxt in final:                  ######
        print pagetxt.text                 ######
        records.append((pagetxt.text))     ######

df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')
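As a side note, requests is imported but never used; if you prefer it to urlopen, the fetch inside the loop could look like this (a sketch assuming the pages are plain HTML):

response = requests.get(code)
# response.text is the decoded body, ready for BeautifulSoup.
soup = BeautifulSoup(response.text, "html.parser")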
Currently I am using the code below:
import urllib
from bs4 import BeautifulSoup
import codecs

file_obj = open("E:\\Sport_Cricket.txt", "r+")
links_str = file_obj.readlines()
c = 1
for j in links_str:
    url = j.rstrip('\n')
    if url.endswith("ece"):
        htmltext = urllib.urlopen(url).read()
        soup = BeautifulSoup(htmltext, "lxml")
        # title
        webpage_title = soup.find_all("h1", attrs={"class": "title"})
        webpage_title = webpage_title[0].get_text(strip=True)
        with codecs.open("E:\\Corpus\\Sport\\Cricket\\text" + str(c) + ".txt", "w+", encoding="utf-8") as f:
            f.writelines(webpage_title + "\r\n")
        c = c + 1
Sport_Cricket.txt contains:
http://www.thehindu.com/sport/cricket/unadkat-does-the-trick-for-pune/article18401543.ece
http://www.thehindu.com/sport/cricket/live-updates-delhi-daredevils-versus-mumbai-indians/article18400821.ece
http://www.thehindu.com/sport/cricket/old-guard-wants-pull-out-coa-warns-of-consequences/article18400811.ece
http://www.thehindu.com/sport/cricket/the-rise-of-sandeep-sharma/article18400700.ece
http://www.thehindu.com/sport/cricket/axar-has-found-his-mojo/article18400258.ece
I am getting the following error:
Traceback (most recent call last):
File "C:\Users\PJM\working_extractor_sorted_complete.py", line 31, in <module>
webpage_title = webpage_title[0].get_text(strip=True)
IndexError: list index out of range
Is there any alternative to webpage_title = webpage_title[0].get_text(strip=True)?
Pankaj,
Use this instead to get the title using BeautifulSoup.
webpage_title = soup.title.string
This will get the first title element anywhere in the HTML document.
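If you want to keep the h1-based lookup and just avoid the IndexError on pages that lack a matching tag, a guarded version might look like this (a sketch; falling back to soup.title.string on such pages is an assumption about what you want):

webpage_title_tags = soup.find_all("h1", attrs={"class": "title"})
if webpage_title_tags:
    # At least one matching <h1> exists; take the first.
    webpage_title = webpage_title_tags[0].get_text(strip=True)
else:
    # No matching <h1>; fall back to the document's <title> tag.
    webpage_title = soup.title.string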
import urllib
import re
from bs4 import BeautifulSoup
url="http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece"
htmltext = urllib.urlopen(url).read()
soup = BeautifulSoup(htmltext,"lxml")
webpage_title = soup.find_all('h1', attrs = {"class": "title"})
webpage_title = webpage_title[0].get_text(strip=True)
file_obj = open("E:\\Crawler_paras_sorted_test_webpages_title.txt", "w+")
file_obj.write(webpage_title)
file_obj.close
It gives me the following error:
Traceback (most recent call last):
File "C:\Users\PJM\working_extractor_sorted_title.py", line 37, in <module>
file_obj.write(webpage_title)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u2019' in position 16: ordinal not in range(128)
How do I fix this now? I want the title saved as a string in the file.
You need to encode the string, which contains the Unicode code point \u2019. Try this:
file_obj=open("E:\\Crawler_paras_sorted_test_webpages_title.txt","w+")
file_obj.write(webpage_title.encode('utf-8'))
file_obj.close() # <-- you were missing the ()
Finally figured it out on my own. Here is the code:
import urllib
import re
import codecs
from bs4 import BeautifulSoup
url="http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece"
htmltext = urllib.urlopen(url).read()
soup = BeautifulSoup(htmltext,"lxml")
webpage_title = soup.find_all('h1', attrs = {"class": "title"})
webpage_title = webpage_title[0].get_text(strip=True)
with codecs.open("E:\\Crawler_paras_sorted_test_webpages_title.txt", "w+", encoding="utf-8") as f:
    f.write(webpage_title)
The codecs library really did help.
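For what it is worth, Python 2's io.open (which became the built-in open in Python 3) accepts an encoding argument as well, so a codecs-free version might look like this minimal sketch:

import io

# io.open handles the encoding itself, so unicode strings can be
# written directly without an explicit .encode() call.
with io.open("E:\\Crawler_paras_sorted_test_webpages_title.txt", "w", encoding="utf-8") as f:
    f.write(webpage_title)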
To solve your problem just use:
f_obj.write(webpage_title.encode('ascii', 'ignore'))
or encode to UTF-8 instead, as in the interactive session below:
Python 2.7.12+ (default, Sep 17 2016, 12:08:02)
Type "copyright", "credits" or "license" for more information.
In [8]: import urllib
In [8]: import re
In [9]: from bs4 import BeautifulSoup
In [10]: url = 'http://www.thehindu.com/sci-tech/science/iit-bombay-birds-eye-view-and-quantum-biology/article18191268.ece'
In [11]: htmltext = urllib.urlopen(url)
In [12]: soup = BeautifulSoup(htmltext,'lxml')
In [13]: webpage_title = soup.find_all('h1',attrs={'class':'title'})
In [14]: print webpage_title
[<h1 class="title">\nIIT Bombay: Bird\u2019s eye view and quantum biology\n</h1>]
In [15]: webpage_title = webpage_title[0].get_text(strip=True)
In [16]: print webpage_title
IIT Bombay: Bird’s eye view and quantum biology
In [17]: f_obj = open('Crawler_paras_sorted_test_webpages_title.txt', 'w')
In [18]: f_obj.write(webpage_title)
---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-18-...> in <module>()
----> 1 f_obj.write(webpage_title)
UnicodeEncodeError: 'ascii' codec can't encode character u'\u2019' in position 16: ordinal not in range(128)
In [19]: f_obj.write(webpage_title.encode("utf8"))
In [21]: f_obj.close()
OK, solved this using:
webpage_title.encode("utf8")
The Python code below is not working to fetch the data from the given link. Please help me make it work.
import urllib2
from bs4 import BeautifulSoup
quote_page = 'http://www.smartvidya.co.in/2016/11/ugc-net-paper-1-previous-year-questions_14.html'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
name_box = soup.find('div', attrs={'class': 'MsoNormal'})
print name_box
Try this:
import urllib2
from bs4 import BeautifulSoup
quote_page = 'http://www.smartvidya.co.in/2016/11/ugc-net-paper-1-previous-year-questions_14.html'
page = urllib2.urlopen(quote_page)
soup = BeautifulSoup(page, 'html.parser')
for name_box in soup.findAll('div', attrs={'class': 'MsoNormal'}):
    print name_box.text
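As a side note, findAll is the older BeautifulSoup spelling; in bs4 the same loop can also be written with a CSS selector (a sketch):

# Equivalent bs4 spelling using a CSS selector:
for name_box in soup.select("div.MsoNormal"):
    print name_box.get_text(strip=True)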
Hope this helps!
I have this code:
import urllib
import urlparse
from bs4 import BeautifulSoup
url = "http://www.downloadcrew.com/?act=search&cat=51"
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
for a in soup.select("div.productListingTitle a[href]"):
    try:
        print (a["href"]).encode("utf-8", "replace")
    except:
        print "no link"
        pass
But when I run it, I only get 20 links. The output should contain more than 20 links.
That is because you only download the first page of content.
Just use a loop to download all the pages:
import urllib
import urlparse
from bs4 import BeautifulSoup

for i in xrange(3):
    url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
    pageHtml = urllib.urlopen(url)
    soup = BeautifulSoup(pageHtml)
    for a in soup.select("div.productListingTitle a[href]"):
        try:
            print (a["href"]).encode("utf-8", "replace")
        except:
            print "no link"
If you don't know the number of pages in advance, you can do this:
import urllib
import urlparse
from bs4 import BeautifulSoup

i = 0
while 1:
    url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
    pageHtml = urllib.urlopen(url)
    soup = BeautifulSoup(pageHtml)
    has_more = 0
    for a in soup.select("div.productListingTitle a[href]"):
        has_more = 1
        try:
            print (a["href"]).encode("utf-8", "replace")
        except:
            print "no link"
    if has_more:
        i += 1
    else:
        break
I ran it on my computer and it got 60 links from three pages.
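If you would rather not hand-format the query string, urllib.urlencode can build it for you; a small sketch using the same parameters as above:

import urllib

# Build "act=search&page=N&cat=51" from a dict instead of a format string.
params = urllib.urlencode({'act': 'search', 'page': i, 'cat': 51})
url = "http://www.downloadcrew.com/?" + params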
Good luck~