Feed Rethinkdb with scrapy

Feed Rethinkdb with scrapy - python-2.7

I'm looking for a simple tutorial explaining how to write items to Rethinkdb from scrapy. The equivalent can be found for MongoDB here.

Here is a translation of "Write items to MongoDB" line for line with RethinkDB.
A couple of notes:
I'm not sure where crawler.settings are set.
The scrapy docs say process_item's second param item can be an
object or dict, so the .insert(dict(item)) cast/conversion is probably necessary.
import rethinkdb as r
class RethinkDBPipeline(object):
    """Scrapy item pipeline that inserts every scraped item into RethinkDB.

    The connection parameters are read from the crawler settings
    ``RETHINKDB_URI``, ``RETHINKDB_PORT`` and ``RETHINKDB_DATABASE`` by
    :meth:`from_crawler`.
    """

    # Table that receives the scraped items.
    table_name = 'scrapy_items'

    def __init__(self, rethinkdb_uri, rethinkdb_port, rethinkdb_db):
        """Store the connection parameters; no connection is opened yet."""
        self.rethinkdb_uri = rethinkdb_uri
        self.rethinkdb_port = rethinkdb_port
        self.rethinkdb_db = rethinkdb_db
        self.conn = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from ``crawler.settings``.

        The port has to be passed as well: ``__init__`` requires it, so
        leaving it out makes this constructor raise ``TypeError``.
        28015 is RethinkDB's default client driver port.
        """
        return cls(
            rethinkdb_uri=crawler.settings.get('RETHINKDB_URI'),
            rethinkdb_port=crawler.settings.get('RETHINKDB_PORT', 28015),
            rethinkdb_db=crawler.settings.get('RETHINKDB_DATABASE', 'items'),
        )

    def open_spider(self, spider):
        """Open the RethinkDB connection when the spider starts."""
        self.conn = r.connect(
            host=self.rethinkdb_uri,
            port=self.rethinkdb_port,
            db=self.rethinkdb_db,
        )

    def close_spider(self, spider):
        """Close the connection opened by :meth:`open_spider`, if any."""
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    def process_item(self, item, spider):
        """Insert ``item`` into ``table_name`` and hand it on unchanged."""
        # dict(item) converts a scrapy Item into a plain dict for insertion.
        r.table(self.table_name).insert(dict(item)).run(self.conn)
        return item

Related

Django update index after adding records to db programmatically

I added a number of records to a Django db table (machina forums) directly via a script (ie: I did not use the site admin interface). The structure seemed fairly straightforward with no foreign keys in other tables.
However the resulting displays are uneven. In a forum index display all of the children forums display under a category. However if I go into the category, only forums added via the admin interface are visible. There does not appear to be any difference in the db records between those that were added programmatically and those added via the admin interface.
I am guessing the issue has to do with indexes on the table. However when I use a GUI to view the db all of the indexes show "result set is empty."
Any ideas about what is causing the problem and if it is index related, how do I update the index?
Here is the view that creates the forum displays:
"""
Forum views
===========
This module defines views provided by the ``forum`` application.
"""
from django.http import HttpResponseRedirect
from django.shortcuts import get_object_or_404
from django.views.generic import ListView
from machina.apps.forum.signals import forum_viewed
from machina.conf import settings as machina_settings
from machina.core.db.models import get_model
from machina.core.loading import get_class
# The models and helper classes used by the views below are looked up by
# name through machina's get_model()/get_class() loaders instead of being
# imported directly.
Forum = get_model('forum', 'Forum')
Topic = get_model('forum_conversation', 'Topic')
ForumVisibilityContentTree = get_class('forum.visibility', 'ForumVisibilityContentTree')
PermissionRequiredMixin = get_class('forum_permission.viewmixins', 'PermissionRequiredMixin')
TrackingHandler = get_class('forum_tracking.handler', 'TrackingHandler')
class IndexView(ListView):
    """ List view rendering the forum index, i.e. the top-level forums. """

    template_name = 'forum/index.html'
    context_object_name = 'forums'

    def get_queryset(self):
        """ Builds the visibility tree of the forums the current user may see. """
        permission_handler = self.request.forum_permission_handler
        visible_forums = permission_handler.forum_list_filter(
            Forum.objects.all(), self.request.user,
        )
        return ForumVisibilityContentTree.from_forums(visible_forums)

    def get_context_data(self, **kwargs):
        """ Extends the template context with the global post and topic totals. """
        context = super(IndexView, self).get_context_data(**kwargs)
        content_tree = context['forums']

        # Totals are accumulated over the top-level nodes of the tree only.
        posts_total = 0
        for node in content_tree.top_nodes:
            posts_total += node.posts_count
        context['total_posts_count'] = posts_total

        topics_total = 0
        for node in content_tree.top_nodes:
            topics_total += node.topics_count
        context['total_topics_count'] = topics_total

        return context
class ForumView(PermissionRequiredMixin, ListView):
    """ Shows a single forum with its topics and, when present, its sub-forums. """

    template_name = 'forum/forum_detail.html'
    context_object_name = 'topics'
    paginate_by = machina_settings.FORUM_TOPICS_NUMBER_PER_PAGE
    permission_required = ['can_read_forum', ]
    view_signal = forum_viewed

    def get(self, request, **kwargs):
        """ Answers GET requests; a link forum redirects to its target. """
        forum = self.get_forum()
        response = (
            HttpResponseRedirect(forum.link)
            if forum.is_link
            else super(ForumView, self).get(request, **kwargs)
        )
        # The signal is sent for both the redirect and the regular response.
        self.send_signal(request, response, forum)
        return response

    def get_forum(self):
        """ Returns the forum named by the URL, fetching it only once per view. """
        if hasattr(self, 'forum'):
            return self.forum
        self.forum = get_object_or_404(Forum, pk=self.kwargs['pk'])
        return self.forum

    def get_queryset(self):
        """ Returns the approved, non-announce topics of the forum. """
        self.forum = self.get_forum()
        topics = self.forum.topics.exclude(type=Topic.TOPIC_ANNOUNCE)
        topics = topics.exclude(approved=False)
        return topics.select_related('poster', 'last_post', 'last_post__poster')

    def get_controlled_object(self):
        """ Returns the object the permission check applies to: the forum. """
        return self.get_forum()

    def get_context_data(self, **kwargs):
        """ Adds the forum, its sub-forums, announces and unread topics. """
        context = super(ForumView, self).get_context_data(**kwargs)

        forum = self.get_forum()
        context['forum'] = forum

        # Descendant forums, restricted to those the user is allowed to see.
        permission_handler = self.request.forum_permission_handler
        visible_descendants = permission_handler.forum_list_filter(
            forum.get_descendants(), self.request.user,
        )
        context['sub_forums'] = ForumVisibilityContentTree.from_forums(visible_descendants)

        # Announces are excluded from the paginated topics and are listed
        # separately so that every page of the forum can show them.
        announce_topics = forum.topics.select_related(
            'poster', 'last_post', 'last_post__poster',
        ).filter(type=Topic.TOPIC_ANNOUNCE)
        context['announces'] = list(announce_topics)

        # Unread state is computed for the topics of this page plus the announces.
        tracker = TrackingHandler(self.request)
        displayed_topics = list(context[self.context_object_name]) + context['announces']
        context['unread_topics'] = tracker.get_unread_topics(
            displayed_topics, self.request.user,
        )

        return context

    def send_signal(self, request, response, forum):
        """ Fires ``view_signal`` with the forum, user, request and response. """
        self.view_signal.send(
            sender=self,
            forum=forum,
            user=request.user,
            request=request,
            response=response,
        )

Seems like you're talking about some custom solution and it is hard to help without additional details like SQL queries that you've applied, model code, queries, and their returns.

Get Django CMS plugin information from Haystack search results

My search results page needs to display information about the Plugins where the query was found, too. I found this question with a similar problem, but I don't only need the contents, I need to know stuff about the plugin - i.e. what's it called, where it is on the page and stuff. Basically I would like a reference to the plugin where the query was located, but I can only find information about the page and title. I haven't been able to find it anywhere on the SearchQuerySet object and in the vicinity - also coming up empty in the documentation for Haystack. Is it possible and how?
Stack I'm using: Elasticsearch 2.4, django-haystack 2.8, aldryn-search 1.0 (for CMS indexing).

I ended up writing a new index for CMSPlugins. Not sure how much use my code is, but maybe it'll help someone out.
from django.conf import settings
from aldryn_search.helpers import get_plugin_index_data
from aldryn_search.utils import clean_join, get_index_base
from cms.models import CMSPlugin
# Search index over individual CMSPlugin rows (rather than whole pages), so
# that a search hit can be traced back to the plugin it came from.
# NOTE(review): this is a sketch with author placeholders (see get_url,
# ancestors_queryset and get_title); it is not runnable as written.
class CMSPluginIndex(get_index_base()):
haystack_use_for_indexing = True
index_title = True
object_actions = ('publish', 'unpublish')
def get_model(self):
return CMSPlugin
# Indexes the plugins of the given language that sit on non-draft pages,
# skipping the plugin types listed in settings.HAYSTACK_EXCLUDED_PLUGINS.
def get_index_queryset(self, language):
return CMSPlugin.objects.select_related(
'placeholder'
).prefetch_related(
'placeholder__page_set'
).filter(
placeholder__page__publisher_is_draft=False,
language=language
).exclude(
plugin_type__in=settings.HAYSTACK_EXCLUDED_PLUGINS
).distinct()
# Indexed text = the plugin's own text, plus the page meta description when
# it is non-empty, plus the page meta keywords when the page object provides
# a callable get_meta_keywords.
def get_search_data(self, obj, language, request):
current_page = obj.placeholder.page
text_bits = []
plugin_text_content = self.get_plugin_search_text(obj, request)
text_bits.append(plugin_text_content)
page_meta_description = current_page.get_meta_description(fallback=False, language=language)
if page_meta_description:
text_bits.append(page_meta_description)
page_meta_keywords = getattr(current_page, 'get_meta_keywords', None)
if callable(page_meta_keywords):
text_bits.append(page_meta_keywords())
return clean_join(' ', text_bits)
def get_plugin_search_text(self, base_plugin, request):
plugin_content_bits = get_plugin_index_data(base_plugin, request)
return clean_join(' ', plugin_content_bits)
def prepare_pub_date(self, obj):
return obj.placeholder.page.publication_date
def prepare_login_required(self, obj):
return obj.placeholder.page.login_required
# Falls back to the page URL when the plugin has no matching ancestor.
def get_url(self, obj):
parent_obj = self.ancestors_queryset(obj).first()
if not parent_obj:
return obj.placeholder.page.get_absolute_url()
# NOTE(review): author placeholder - as written this returns None when a
# matching ancestor exists; supply the project-specific URL here.
return # however you get the URL in your project
# Title object of the plugin's page: the non-draft one in the plugin's language.
def get_page_title_obj(self, obj):
return obj.placeholder.page.title_set.get(
publisher_is_draft=False,
language=obj.language
)
# Ancestors of the plugin restricted by plugin type, ordered by descending
# depth, so .first() yields the deepest match.
def ancestors_queryset(self, obj):
return obj.get_ancestors().filter(
# NOTE(review): author placeholder - `plugin_type=` has no value, which is a
# syntax error until the wanted plugin type(s) are filled in.
plugin_type=# Some plugins that I wanted to find
).order_by(
'-depth'
)
# Falls back to the page title when the plugin has no matching ancestor.
def get_title(self, obj):
parent_obj = self.ancestors_queryset(obj).first()
if not parent_obj:
return self.get_page_title_obj(obj).title
# NOTE(review): author placeholder - returns None when a matching ancestor
# exists; supply the title taken from parent_obj here if wanted.
return # get title from parent obj if you want to
def prepare_site_id(self, obj):
return obj.placeholder.page.node.site_id
def get_description(self, obj):
return self.get_page_title_obj(obj).meta_description or None

If you are using aldryn-search, you only need to define in PLACEHOLDERS_SEARCH_LIST all the placeholders you want to check, therefore all plugins inside will be checked:
# Settings snippet: lists the placeholders whose plugins should be searched
# ('include') and those to leave out ('exclude').
# NOTE(review): '*' presumably acts as a catch-all key - confirm against the
# aldryn-search documentation.
PLACEHOLDERS_SEARCH_LIST = {
'*': {
'include': ['content'],
'exclude': [''],
},
}

How do we call a function each time an api end-point is called in django

In my Django server, there is a REST API through which we save values in the database. If the name already exists in the database, I update its value; otherwise I create a new name and value. The code for the function is given below:
def getIgnitionData():
    """Mirror the first row of ``MDA_table`` into ``Ignition`` records.

    Every column of ``MDA_table`` (read through the ``'ignition'`` database
    connection) becomes one ``Ignition`` entry whose ``name`` is the column
    name and whose ``value`` is the stringified cell of the first row.
    Existing entries are updated and missing ones are created. When the
    table has no rows, nothing is written.
    """
    cursor = connections['ignition'].cursor()
    try:
        cursor.execute('SELECT * FROM MDA_table')
        # Take the column names from the result set itself so that they are
        # guaranteed to be in the same order as the values of each row; a
        # separate INFORMATION_SCHEMA query gives no such guarantee.
        name_list = [str(column[0]) for column in cursor.description]
        value = cursor.fetchall()
    finally:
        # Release the cursor even when one of the queries fails.
        cursor.close()

    # Debug output; print() with one argument behaves the same on Python 2 and 3.
    print(name_list)
    print(value)

    if not value:
        # Empty table: there are no values to pair with the column names.
        return

    value_list = [str(cell) for cell in value[0]]

    # Update the variable when it exists, otherwise create it. update()
    # returns the number of matched rows, so no separate exists() query is
    # needed.
    for name, current_value in zip(name_list, value_list):
        updated = Ignition.objects.filter(name=name).update(value=current_value)
        if not updated:
            Ignition.objects.create(name=name, value=current_value)
The view_api.py is as follows:
class IgnitionViewSet(viewsets.ModelViewSet):
"""
API endpoint that allows to view variables from the ignition database.
"""
serializer_class = IgnitionSerializer
#queryset = ignition.objects.all()
permission_classes = [HasGroupPermission]
# Groups allowed per HTTP method; PUT and POST list no groups.
required_groups = {
'GET': ['Admin', 'Facility', 'Operator'],
'PUT': [],
'POST': [],
}
# NOTE: this call is a statement in the class body, so Python executes it
# once, when the class is defined (i.e. when the module is imported), and
# not on every request. This is the behaviour the question is about.
ignition.getIgnitionData() # This is where we are calling the function
def get_queryset(self):
return Ignition.objects.all()
The code works well when I run the GET request for the first time from the browser, but if I then update the values in the database without restarting the server, it doesn't even print the name_list (which means the function is not called). If I restart the server and access the endpoint, it does fetch the updated values. This is not practical, though.
I want the API endpoint to fetch the updated values from the database every time it is called, so that I don't have to restart the server each time. Thanks in advance.

You can override the dispatch() method, which is called each time your view is used:
class IgnitionViewSet(viewsets.ModelViewSet):
    """
    API endpoint that allows to view variables from the ignition database.
    """
    permission_classes = [HasGroupPermission]
    # Groups allowed per HTTP method; PUT and POST list no groups.
    required_groups = {
        'GET': ['Admin', 'Facility', 'Operator'],
        'PUT': [],
        'POST': [],
    }
    serializer_class = IgnitionSerializer

    # No class-level queryset: get_queryset() builds it on demand.
    def get_queryset(self):
        return Ignition.objects.all()

    def dispatch(self, request, *args, **kwargs):
        # Refresh the Ignition records before handing the request on to the
        # regular dispatch; because this is a method, it runs for every
        # request that reaches the view.
        ignition.getIgnitionData()  # This is where we are calling the function
        parent = super(IgnitionViewSet, self)
        return parent.dispatch(request, *args, **kwargs)

django haystack FacetedSearchView returning empty results?

I'm using Django Haystack's FacetedSearchView. My views.py:
from haystack.generic_views import FacetedSearchView as BaseFacetedSearchView
class FacetedSearchView(BaseFacetedSearchView):
    """Faceted search view that facets on the ``source`` field."""

    facet_fields = ['source']
    template_name = 'test.html'
and in urls.py:
url(r'^search', FacetedSearchView.as_view(), name='haystack_search')
and in test.html I'm rendering the facets.
When I issue a request, the content of the facets context object is an empty dict, but I think it should return all the facets I specified in facet_fields. When I append a q parameter to the request's query string (with any value), it returns a result, but with zero documents. Is it necessary to provide the q parameter, and if so, with which value?

To solve the issue, one needs to override the search method of FacetedSearchForm, because the original implementation assumes a query 'q', whereas faceting needs only facet_fields to work.
from haystack.forms import FacetedSearchForm as BaseFacetedSearchForm
from haystack.generic_views import FacetedSearchView as BaseFacetedSearchView
class FacetedSearchForm(BaseFacetedSearchForm):
    """Faceted search form whose search() does not depend on a ``q`` query.

    The result set is the form's ``searchqueryset`` narrowed by the selected
    facets only.
    """

    def __init__(self, *args, **kwargs):
        # Take selected_facets out of kwargs before the parent initialiser
        # sees them.
        self.selected_facets = kwargs.pop("selected_facets", [])
        super(FacetedSearchForm, self).__init__(*args, **kwargs)

    def search(self):
        """Return the search queryset narrowed by every selected facet."""
        if not self.is_valid():
            return self.no_query_found()

        results = self.searchqueryset

        # Each facet arrives as "field:value"; the two parts are split so
        # that the value can be cleaned and quoted separately from the
        # field name.
        for selected in self.selected_facets:
            field_name, separator, raw_value = selected.partition(":")
            if not separator or not raw_value:
                # Entries without a colon or with an empty value are skipped.
                continue
            cleaned_value = results.query.clean(raw_value)
            results = results.narrow(u'%s:"%s"' % (field_name, cleaned_value))

        return results.load_all() if self.load_all else results
class FacetedSearchView(BaseFacetedSearchView):
    """Faceted search view that uses the custom FacetedSearchForm."""

    form_class = FacetedSearchForm
    facet_fields = []
    template_name = 'facets.html'

How to access other methods in a django class?

I'm just learning Python and Django.
What I want to do is something like this
csvobject = CSVViewer(file)
rows = csvobject.get_row_count()
This is what I have so far. Remember this is all new to me, so I'm looking for an explanation. Thanks.
# Wraps a CSV source and exposes its rows and row count.
class CSVViewer:
def __init__(self, file=None):
self.file = file
# Collects every row produced by csv.reader(self.file) into a list.
def read_file(self):
data = []
file_read = csv.reader(self.file)
for row in file_read:
data.append(row)
return data
def get_row_count(self):
# NOTE: read_file is referenced here as a bare name, not as a method of
# this instance (self.read_file()); this is the call the question is about.
return len(read_file(self))
I am having problems with read_file(self)

Well, first of all, it seems you're missing import csv, which is needed for csv.reader(self.file) to work in the read_file(self) method.
Second, you must call the instance method read_file like this self.read_file() in the get_row_count method. This should work:
import csv
class CSVViewer:
    """Small helper that reads CSV data and reports how many rows it has.

    ``file`` may be an open text file (or any other iterable of CSV lines),
    a string holding CSV text, or ``None`` when there is no data.
    """

    def __init__(self, file=None):
        self.file = file

    def read_file(self):
        """Return the rows of ``self.file`` as a list of lists of strings.

        The source is not rewound, so an open file that has already been
        read to its end yields no further rows.
        """
        if self.file is None:
            return []
        if isinstance(self.file, str):
            # csv.reader iterates over lines; iterating a bare string would
            # feed it one character at a time, so split the text first.
            lines = self.file.splitlines()
        else:
            # Pass file objects and other line iterables straight through.
            # Wrapping them in a list would hand csv.reader the object
            # itself as if it were a single line of text.
            lines = self.file
        return list(csv.reader(lines))

    def get_row_count(self):
        """Return the number of rows produced by :meth:`read_file`."""
        return len(self.read_file())

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Feed Rethinkdb with scrapy - python-2.7

I'm looking for a simple tutorial explaining how to write items to Rethinkdb from scrapy. The equivalent can be found for MongoDB here.

Related

Django update index after adding records to db programmatically

Get Django CMS plugin information from Haystack search results

How do we call a function each time an api end-point is called in django

django haystack FacetedSearchView returning empty results?

How to access other methods in a django class?

Categories

Resources