Blank list error when scraping data from a web site - python-2.7

I have a problem with an API: it returns an empty list.
I searched the web, but none of the results answered my question.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib import re
site = "http://www.hurriyet.com.tr"
regex = "<span class='news-title'>(.+?)</span>"
comp = re.compile(regex)
print(comp) print(regex)
htmlkod = urllib.urlopen(site).read()
titles = re.findall(regex, htmlkod)
print(titles)
i=1
for title in titles:
print str(i), title.decode("iso8859-9")
print(title)
i+=1
I expected it to return the news titles, but it returns "[]" — an empty list.

I recommend using BeautifulSoup instead of a regex, like this:
from urllib import urlopen
from bs4 import BeautifulSoup
site = "http://www.hurriyet.com.tr"
openurl = urlopen(site)
soup = BeautifulSoup(openurl, "html.parser")
getTitle = soup.findAll('span', attrs={'class': 'news-title'})
for title in getTitle:
print title.text

Related

How to Run Multiples URLs in Requests from a File

I'm trying to scrape multiple websites from URLs in a txt file. There is one URL per line.
My code is:
Import requests
from bs4 import BeautifulSoup
file = open('url.txt', 'r')
filelines = file.readline()
urllist = requests.get(filelines)
soup = BeautifulSoup(urllist.content, "html.parser")
content = soup.find_all("span", {"class": "title-main-info"})
print content
But it only prints the content of one URL. What am I doing wrong?
Thanks
Try this. It should work:
import requests
from bs4 import BeautifulSoup
with open('url.txt', 'r') as f:
for links in f.readlines():
urllist= requests.get(links.strip())
soup = BeautifulSoup(urllist.content, "html.parser")
content = soup.find_all("span", {"class": "title-main-info"})
print content

Where to put scraper script in django so as to access django database

I have a script which populates a database with initial data (football matches) from a web scraper. Currently I have put it in init.py, but that looks and sounds wrong. Here is my code in init.py. Where should I put this code so that it updates the matches field in my database? Kindly help.
from .models import Betting
import re
import requests
import urllib
from bs4 import BeautifulSoup
urls= "https://sms.betyetu.co.ke/tomorrow.html"
htmlfile = urllib.urlopen(urls)
htmltext = htmlfile.read()
soup = BeautifulSoup(htmltext, "html.parser")
for i in soup.findAll("table"):
try:
momo = 1
a = len( i.contents[3].contents)
while momo <= a:
foo = Betting(matches=i.contents[3].contents[momo].findAll("td")[2].text)
momo += 2
**strong text** foo.save()
print i.contents[3].contents[3].findAll("td")[0].text
except:
momo = 1

How to crawl latest articles in a specific site using specific set keyword?

I am trying Python code for crawling article links on specific sites based on a keyword, such as the name of the article, but I don't get the appropriate links.
#!/usr/bin/env python
# Search a site for articles matching a keyword and print the links found.
# Fixes: the original mixed Python 2 and 3 ("from urlparse import urlparse"
# is Python 2 while urllib.request is Python 3; urlencode lives in
# urllib.parse, not urlparse), called .content on the bytes returned by
# read(), and wrote the attribute filter as {"class:info"} (colon inside
# the key).  This version is consistently Python 3.
import sys
import urllib.request
from urllib.parse import urlencode

from bs4 import BeautifulSoup


def extract_article_links(url, data):
    """POST the encoded search form *data* to *url*, print every anchor,
    then print the text of each <div class="info"> result."""
    req = urllib.request.Request(url, data)
    response = urllib.request.urlopen(req)
    responseData = response.read()  # bytes -- bs4 accepts these directly
    soup = BeautifulSoup(responseData, "html.parser")

    for link in soup.find_all('a'):
        try:
            print("<a href='%s'>%s</a>" % (link.get('href'), link.text))
        except Exception as e:
            print(e)

    # attribute filter must be {"class": "info"}, not {"class:info"}
    info_divs = soup.find_all("div", {"class": "info"})
    print(info_divs)
    for item in info_divs:
        print(item.contents[0].text)
        print(item.contents[1].text)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Insufficient arguments..!!")
        sys.exit(1)
    url = sys.argv[1]
    values = {'s': 'article', 'submit': 'search'}
    data = urlencode(values).encode('utf-8')
    extract_article_links(url, data)
Try lxml: analyze the HTML and locate the elements you are looking for; then you can do this easily with XPath:
from lxml import html
print map (lambda link: link, html.fromstring(source).xpath('//a/#href'))
of course you need to modify the xpath according to the attribute you are looking for.
try this
import requests
from bs4 import BeautifulSoup
def extract_article_links(url,data):
soup = BeautifulSoup(requests.get('http://www.hindustantimes.com/Search/search.aspx?q={}&op=All&pt=all&auth=all'.format(data)).content)
responseData = soup.find("ul",{'class':'searchNews'})
_a, _li = responseData.find_all('a'), responseData.find_all('li')
for i,j in zip(_a,_li):
print '='*40,'\nLink: ',i['href'], '\nTitle: ',i.contents[0], '\nContent: \n\t', j.p.get_text(),'\n'
if __name__ == "__main__":
url = "http://www.hindustantimes.com/"
extract_article_links(url,'article')

Scrapy: re.match not finding URLs in a string using regex

I am trying to crawl many URLs in the same domain. I have the URL list in a string. I want to search the string with a regex and find the URLs, but re.match() always returns None. I tested my regex and it works. This is my code:
# -*- coding: UTF-8 -*-
import scrapy
import codecs
import re
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import Request
from scrapy.selector import HtmlXPathSelector
from hurriyet.items import HurriyetItem
class hurriyet_spider(CrawlSpider):
name = 'hurriyet'
allowed_domains = ['hurriyet.com.tr']
start_urls = ['http://www.hurriyet.com.tr/gundem/']
rules = (Rule(SgmlLinkExtractor(allow=('\/gundem(\/\S*)?.asp$')),'parse',follow=True),)
def parse(self, response):
image = HurriyetItem()
text = response.xpath("//a/#href").extract()
print text
urls = ''.join(text)
page_links = re.match("(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’]))", urls, re.M)
image['title'] = response.xpath("//h1[#class = 'title selectionShareable'] | //h1[#itemprop = 'name']/text()").extract()
image['body'] = response.xpath("//div[#class = 'detailSpot']").extract()
image['body2'] = response.xpath("//div[#class = 'ctx_content'] ").extract()
print page_links
return image, text
There is no need to use the re module; Scrapy selectors have a built-in feature for regex filtering:
# Sketch (not runnable as-is): filter extracted hrefs with the selector's
# built-in .re() instead of the re module.
# NOTE(review): "#href" here, as in the question, is almost certainly
# markup-mangled "@href" -- XPath selects attributes with "@"; confirm
# before copying.
def parse(self, response):
# "..." stands for the rest of the original parse() body
...
page_links = response.xpath("//a/#href").re('your_regex_expression')
...
With that said, I suggest you play with this approach in the Scrapy shell first to make sure your regex is indeed working. Because I wouldn't expect people to try to debug a mile long regex - it's basically a write only language :)

How can I get all the software links?

I have this code:
import urllib
import urlparse
from bs4 import BeautifulSoup
url = "http://www.downloadcrew.com/?act=search&cat=51"
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
for a in soup.select("div.productListingTitle a[href]"):
try:
print (a["href"]).encode("utf-8","replace")
except:
print "no link"
pass
But when I run it, I only get 20 links. The output should be more than 20 links.
That is because you only download the first page of content.
Just use a loop to download all pages:
import urllib
import urlparse
from bs4 import BeautifulSoup
for i in xrange(3):
url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
for a in soup.select("div.productListingTitle a[href]"):
try:
print (a["href"]).encode("utf-8","replace")
except:
print "no link"
If you don't know the number of pages, you can do this:
import urllib
import urlparse
from bs4 import BeautifulSoup
i = 0
while 1:
url = "http://www.downloadcrew.com/?act=search&page=%d&cat=51" % i
pageHtml = urllib.urlopen(url)
soup = BeautifulSoup(pageHtml)
has_more = 0
for a in soup.select("div.productListingTitle a[href]"):
has_more = 1
try:
print (a["href"]).encode("utf-8","replace")
except:
print "no link"
if has_more:
i += 1
else:
break
I ran it on my computer and it got 60 links from three pages.
Good luck!