Haystack + solr duplicate on update - django

I'm new to haystack/solr so this is likely a newbie error. I am using solr with haystack.
When I run update_index, it seems to be duplicating the records. I am getting:
get() returned more than one Doctor -- it returned 3!
for this piece of code:
self._object = self.searchindex.read_queryset().get(pk=self.pk)
If I run update_index again, the number returned increases by one; if I run rebuild_index, it works and shows only one record until I run update_index again.
So from that, it seems that update_index is duplicating records in the index. How do I stop it from doing that?
Here is my haystack search index:
from haystack import indexes
from .models import Doctor, Zipcode
from django.contrib.gis.measure import D
from django.conf import settings
class DoctorIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index describing how ``Doctor`` objects are indexed.

    Fields without ``model_attr`` or ``use_template`` are filled in by the
    matching ``prepare_<field>`` method below, where one is defined.
    """

    text = indexes.EdgeNgramField(document=True, use_template=True)
    name = indexes.EdgeNgramField(model_attr='name')
    specialty = indexes.MultiValueField()
    condition = indexes.MultiValueField()
    procedure = indexes.MultiValueField()
    premium = indexes.BooleanField()
    location = indexes.LocationField(model_attr='main_office__location')
    latitude = indexes.DecimalField(indexed=False)
    longitude = indexes.DecimalField(indexed=False)
    docid = indexes.IntegerField()
    slugify_name = indexes.CharField(indexed=False)
    rendered = indexes.CharField(use_template=True, indexed=False)
    premium_rendered = indexes.CharField(use_template=True, indexed=False)
    include = indexes.BooleanField(indexed=False)

    def get_model(self):
        """Return the model class this index covers."""
        return Doctor

    def prepare_specialty(self, obj):
        """Return one ``"<parent name> <name>"`` string per specialty of ``obj``.

        The parent part is an empty string when a specialty has no parent.
        """
        labels = []
        for specialty in obj.specialty.all():
            parent_name = specialty.parent.name if specialty.parent else ""
            labels.append("%s %s" % (parent_name, specialty.name))
        return labels

    def prepare_condition(self, obj):
        """Return the names of all conditions attached to ``obj``."""
        return [condition.name for condition in obj.conditions.all()]

    def prepare_procedure(self, obj):
        """Return the names of all procedures attached to ``obj``."""
        return [procedure.name for procedure in obj.procedures.all()]

    def prepare_premium(self, obj):
        """Return the ``'premium'`` entry of ``obj.display()``."""
        return obj.display()['premium']

    def prepare_latitude(self, obj):
        """Return the latitude of the doctor's main office."""
        return obj.main_office.lat

    def prepare_longitude(self, obj):
        """Return the longitude of the doctor's main office."""
        return obj.main_office.lon

    def prepare_docid(self, obj):
        """Return the primary key of ``obj``."""
        return obj.id

    def prepare_slugify_name(self, obj):
        """Return ``obj.slugify_name()``."""
        return obj.slugify_name()

    def index_queryset(self, using=None):
        """Used when the entire index for model is updated."""
        # NOTE(review): `specialty` is a to-many relation (see
        # `obj.specialty.all()` above), so filtering across it can yield the
        # same Doctor once per matching specialty.
        return self.get_model().objects.filter(specialty__search_include=True)
Here is my solr schema: https://gist.github.com/anonymous/5d5b011ca7fa0f3f3e29
I've done a lot of googling, but can't seem to find an answer to this.

So this one was tricky to track down, but the problem was actually in my index_queryset function.
This:
return self.get_model().objects.filter(specialty__search_include=True)
should actually be this:
return self.get_model().objects.filter(specialty__search_include=True).distinct()
That function had duplicates in it and was causing my error, not the solr schema like I had thought. Specialty is a ManyToManyField.

I just faced the same issue.
According to this topic, it is necessary to remove the .pyc files. Inside the project, simply run the following (on Linux):
find . -name "*.pyc" -type f -delete

Related

Get Django CMS plugin information from Haystack search results

My search results page needs to display information about the Plugins where the query was found, too. I found this question with a similar problem, but I don't only need the contents, I need to know stuff about the plugin - i.e. what's it called, where it is on the page and stuff. Basically I would like a reference to the plugin where the query was located, but I can only find information about the page and title. I haven't been able to find it anywhere on the SearchQuerySet object and in the vicinity - also coming up empty in the documentation for Haystack. Is it possible and how?
Stack I'm using: Elasticsearch 2.4, django-haystack 2.8, aldryn-search 1.0 (for CMS indexing).
I ended up writing a new index for CMSPlugins. Not sure how much use my code is, but maybe it'll help someone out.
from django.conf import settings
from aldryn_search.helpers import get_plugin_index_data
from aldryn_search.utils import clean_join, get_index_base
from cms.models import CMSPlugin
# Search index over django CMS plugins (CMSPlugin), built on the base class
# returned by aldryn_search's get_index_base().
# NOTE(review): indentation was lost in this listing; every `def` below is a
# method of the class, and the bare `return # ...` lines are placeholders the
# author left for the reader to fill in.
class CMSPluginIndex(get_index_base()):
haystack_use_for_indexing = True
index_title = True
object_actions = ('publish', 'unpublish')
# Model class covered by this index.
def get_model(self):
return CMSPlugin
# Plugins in `language` whose placeholder belongs to a non-draft page,
# skipping the plugin types listed in settings.HAYSTACK_EXCLUDED_PLUGINS.
def get_index_queryset(self, language):
return CMSPlugin.objects.select_related(
'placeholder'
).prefetch_related(
'placeholder__page_set'
).filter(
placeholder__page__publisher_is_draft=False,
language=language
).exclude(
plugin_type__in=settings.HAYSTACK_EXCLUDED_PLUGINS
).distinct()
# Build the indexed text: the plugin's own text, then the page's meta
# description (if any) and meta keywords (if the page offers that method).
def get_search_data(self, obj, language, request):
current_page = obj.placeholder.page
text_bits = []
plugin_text_content = self.get_plugin_search_text(obj, request)
text_bits.append(plugin_text_content)
page_meta_description = current_page.get_meta_description(fallback=False, language=language)
if page_meta_description:
text_bits.append(page_meta_description)
# get_meta_keywords is looked up defensively; it is only called when present.
page_meta_keywords = getattr(current_page, 'get_meta_keywords', None)
if callable(page_meta_keywords):
text_bits.append(page_meta_keywords())
return clean_join(' ', text_bits)
# Join the pieces returned by get_plugin_index_data() into one string.
def get_plugin_search_text(self, base_plugin, request):
plugin_content_bits = get_plugin_index_data(base_plugin, request)
return clean_join(' ', plugin_content_bits)
# Publication date of the page that holds the plugin.
def prepare_pub_date(self, obj):
return obj.placeholder.page.publication_date
# Whether the page that holds the plugin requires login.
def prepare_login_required(self, obj):
return obj.placeholder.page.login_required
# URL for a result: the page URL when no ancestor matches ancestors_queryset();
# otherwise project-specific (placeholder, returns None as written).
def get_url(self, obj):
parent_obj = self.ancestors_queryset(obj).first()
if not parent_obj:
return obj.placeholder.page.get_absolute_url()
return # however you get the URL in your project
# Published (non-draft) title object of the page in the plugin's language.
def get_page_title_obj(self, obj):
return obj.placeholder.page.title_set.get(
publisher_is_draft=False,
language=obj.language
)
# Ancestors of the plugin restricted to certain plugin types, ordered by
# descending depth. The plugin_type value is elided in the original post.
def ancestors_queryset(self, obj):
return obj.get_ancestors().filter(
plugin_type=# Some plugins that I wanted to find
).order_by(
'-depth'
)
# Title for a result: the page title when no ancestor matches;
# otherwise project-specific (placeholder, returns None as written).
def get_title(self, obj):
parent_obj = self.ancestors_queryset(obj).first()
if not parent_obj:
return self.get_page_title_obj(obj).title
return # get title from parent obj if you want to
# Site id taken from the page's node.
def prepare_site_id(self, obj):
return obj.placeholder.page.node.site_id
# Meta description of the page title, or None when it is empty.
def get_description(self, obj):
return self.get_page_title_obj(obj).meta_description or None
If you are using aldryn-search, you only need to define in PLACEHOLDERS_SEARCH_LIST all the placeholders you want to check, therefore all plugins inside will be checked:
PLACEHOLDERS_SEARCH_LIST = {
'*': {
'include': ['content'],
'exclude': [''],
},
}

one to many latest query in a filter/exclude

If I have:
class Info(Model):
...
class Ad(Model):
listed_date = DatetimeField()
info = ForeignKey('Info', related_name='ads', null=True)
....
I want to query Info based on fields within Ad, but only the latest ad. I know I can do:
Ad.objects.latest('listed_date')
But since I will be building up the query by chaining several filter/excludes together, I want something like:
query = query.filter(
Q(**{
'ads__latest__'+attr_name: query_value
})
)
Or perhaps even have a field 'latest_ad' which always points to the most recent based on a certain field. The goal is to be able to query just the latest in the related field in a built up filter/exclude method.
How can I do this?
EDIT:
A little background...
I have 2 models (LegalAd, TrusteeInfo) that store scraped data about the same auction item. Some of the fields need a fair deal of processing to extract the necessary values, hence my decision to store the information in separate models that hold the data at different stages of processing. I then attempt to combine both models into one (AuctionItem), and use properties extensively to prioritize data from TrusteeInfo over LegalAd for the similar fields they share. The problem is that I would like to query those fields, which the use of properties prohibits. So I created a manager and overrode the filter and exclude methods to hold the prioritization logic. Below is the code:
class LegalAd(models.Model):
listed_date = models.DateField(null=True) # field I would like to use for latest query
auction = models.ForeignKey('auction_table.Auction', related_name='legal_ads', null=True)
...
class TrusteeInfo(models.Model):
auction = models.OneToOneField('auction_table.Auction', null=True)
...
class AuctionManager(models.Manager):
    """Manager whose ``filter``/``exclude`` also accept related-model field names.

    A lookup whose base name belongs to ``Auction`` itself is applied as-is.
    A lookup whose base name is a ``TrusteeInfo`` and/or ``LegalAd`` field is
    rewritten to span the corresponding relation; when the name exists on
    both models the two conditions are OR-ed together.
    """

    def do_query_action(self, action, kwargs):
        """Apply ``action`` ('filter' or 'exclude') once per lookup in ``kwargs``.

        Args:
            action: name of the queryset method to call for each lookup.
            kwargs: mapping of lookup expression -> value.

        Returns:
            The resulting queryset with ``distinct()`` applied.

        Raises:
            AttributeError: if a lookup's base name is neither an Auction
                field nor a TrusteeInfo/LegalAd field.
        """
        trusteeinfo = apps.get_model('scrapers', 'TrusteeInfo')
        trustee_fields = [field.name for field in trusteeinfo._meta.get_fields()]
        legalad = apps.get_model('scrapers', 'LegalAd')
        legalad_fields = [field.name for field in legalad._meta.get_fields()]
        related_fields = trustee_fields + legalad_fields
        auction_native_fields = [
            'legal_ads',
            'trusteeinfo',
            'properties',
            'id',
            'pk',
            'created_date',
            'updated_date',
        ]

        # Start from the manager's base queryset; the overridden filter/exclude
        # on this class are not re-entered from here.
        query = self.get_queryset()
        for attr, value in kwargs.items():
            attr_base = attr.split('__')[0]  # get the base attr name
            if attr_base in auction_native_fields:
                query = getattr(query, action)(**{attr: value})
            elif attr_base in related_fields:
                qs = []
                if attr_base in trustee_fields:
                    qs.append(Q(**{'trusteeinfo__' + attr: value}))
                if attr_base in legalad_fields:
                    # Must match related_name='legal_ads' on LegalAd.auction;
                    # the previous 'legalads__' prefix named no relation.
                    qs.append(Q(**{'legal_ads__' + attr: value}))
                query = getattr(query, action)(reduce(or_, qs))
            else:
                raise AttributeError("type object `Auction` has no attribute '{attr}'".format(attr=attr))
        # Spanning the to-many legal_ads relation can repeat an Auction row.
        return query.distinct()

    def filter(self, **kwargs):
        """Filter auctions, resolving related-model field names."""
        return self.do_query_action('filter', kwargs)

    def exclude(self, **kwargs):
        """Exclude auctions, resolving related-model field names."""
        return self.do_query_action('exclude', kwargs)
class Auction(models.Model):
    """An auction item combining data from ``TrusteeInfo`` and ``LegalAd``.

    Most attributes are read-only properties that prefer the value stored on
    the related ``trusteeinfo`` and fall back to the latest legal ad. The
    fallback expressions have the form ``self.latest_ad and ...``, so they
    evaluate to ``False`` when the auction has no legal ads.

    NOTE(review): the properties read ``self.trusteeinfo`` directly; confirm
    that every Auction has a TrusteeInfo, or that a missing one is handled.
    """

    objects = AuctionManager()
    created_date = models.DateTimeField(auto_now_add=True)
    updated_date = models.DateTimeField(auto_now=True)

    @property
    def latest_ad(self):
        """Latest legal ad by ``listed_date``, or ``False`` when there is none."""
        return self.legal_ads.exists() and self.legal_ads.latest('listed_date')

    @property
    def sale_datetime(self):
        """Sale date/time from the trustee info, else from the latest ad."""
        if self.trusteeinfo and self.trusteeinfo.sale_datetime:
            return self.trusteeinfo.sale_datetime
        else:
            return self.latest_ad and self.latest_ad.sale_datetime

    @property
    def county(self):
        """County from the trustee info, else from the latest ad."""
        if self.trusteeinfo and self.trusteeinfo.county:
            return self.trusteeinfo.county
        else:
            return self.latest_ad and self.latest_ad.county

    @property
    def sale_location(self):
        """Sale address of the latest ad."""
        return self.latest_ad and self.latest_ad.sale_address

    @property
    def property_addresses(self):
        """Parsed addresses from the trustee info if any exist, else from the latest ad."""
        if self.trusteeinfo and self.trusteeinfo.parsed_addresses.exists():
            return self.trusteeinfo.parsed_addresses
        else:
            return self.latest_ad and self.latest_ad.parsed_addresses

    @property
    def raw_addresses(self):
        """Addresses from the trustee info, else the ad's ``'addresses'`` entry."""
        if self.trusteeinfo and self.trusteeinfo.addresses:
            return self.trusteeinfo.addresses
        else:
            return self.latest_ad and self.latest_ad.addresses.get('addresses', None)

    @property
    def parcel_numbers(self):
        """Parcel numbers of the latest ad."""
        return self.latest_ad and self.latest_ad.parcel_numbers

    @property
    def trustee(self):
        """Trustee from the trustee info, else the ad's ``'trustee'`` entry."""
        if self.trusteeinfo:
            return self.trusteeinfo.trustee
        else:
            return self.latest_ad and self.latest_ad.trustee.get('trustee', None)

    @property
    def opening_bid(self):
        """Opening bid from the trustee info, else the ad's first ``'bid_owed'`` amount."""
        if self.trusteeinfo and self.trusteeinfo.opening_bid:
            return self.trusteeinfo.opening_bid
        else:
            # The [[None]] default makes the [0][0] lookup yield None when the key is absent.
            return self.latest_ad and self.latest_ad.dollar_amounts.get('bid_owed', [[None]])[0][0]

    @property
    def deposit_amount(self):
        """Deposit from the trustee info, else the ad's first ``'deposit'`` amount."""
        if self.trusteeinfo and self.trusteeinfo.deposit_amount:
            return self.trusteeinfo.deposit_amount
        else:
            # The [[None]] default makes the [0][0] lookup yield None when the key is absent.
            return self.latest_ad and self.latest_ad.dollar_amounts.get('deposit', [[None]])[0][0]

    @property
    def sale_status(self):
        """Sale status recorded on the trustee info."""
        return self.trusteeinfo and self.trusteeinfo.sale_status

    @property
    def trustors(self):
        """Parsed names from the trustee info if any exist, else from the latest ad."""
        if self.trusteeinfo and self.trusteeinfo.parsed_names.exists():
            return self.trusteeinfo.parsed_names
        else:
            return self.latest_ad and self.latest_ad.parsed_names
It gets a bit more complicated because the ads are usually listed 2 at a time, so there is a good chance of 2 ads showing up for the latest date, meaning I would have to run something like a first() method on it too. I could look out for certain kwargs and run a special query for that, but how would I incorporate that into the rest of the kwargs in the chained query? Ideally, I could keep the one-to-many legal_ads, but also be able to do something like:
query.filter(latest_ad__<queryfield>=value)
or:
query.filter(legal_ads__latest__<queryfield>=value)
That would be great.
What you have is the so-called greatest-n-per-group problem; it's hard, or even impossible, to deal with using the ORM.
One way to approach the problem can be found here.
In your case it could be something like this:
Info.objects.filter(
ad__listed_date__in=Info.objects.annotate(
last_date=Max('ad__listed_date')
).values_list('last_date', flat=True)
#now you can add more
#ad__<somefiled> statements
#but you need to make it in a single `.filter` call
#otherwise the ORM will do separate joins per `.filter` call
)
I personally don't like this. It looks like a hack to me, it's not very efficient, and it can very easily return bad results if the penultimate ad in some group has a listed_date equal to that of the last ad in another group.
Workarounds
If you give us some more background about why do you need to filter on the latest_ad per info, maybe we could find another way to get the same/similar results.
However, one workaround which I prefer, is to filter on some date_range. For example, don't search for the latest_ad, but .filter on the latest_ads in last_day or two or a week, depending on your needs. Its pretty easy and efficient (easy to optimize) query.
Info.objects.filter(
ad__listed_date__gte=(today-timedelta(days=1))
#now again you can keep adding more `ad__<somefiled>` statements
#but make sure to enclose them in a single `.filter` call.
)
You also mention a good workaround: if you can easily keep an Info.latest_ad field up to date, then I guess you will be good to go.
If you go for that approach make sure to set on_delete=models.SET_NULL because the default behavior (cascade delete) can bring you problems.
class Info(Model):
    # ...
    # related_name='+' prevents creating a reverse relation.
    # on_delete=models.SET_NULL prevents deleting the Info object when its
    # latest ad gets deleted.
    latest_ad = models.ForeignKey(
        'Ad',
        related_name='+',
        blank=True,
        null=True,
        on_delete=models.SET_NULL,
    )
You can use .latest() along with .filter():
Ad.objects.filter(your_filter=your_value).latest('listed_date')
or using order_by:
Ad.objects.filter(your_filter=your_value).order_by('-listed_date')[0]

Changing ordering of results in django-haystack

I am using django-haystack for search. By default it shows the oldest objects first, whereas I want to show the latest on top. Can anyone guide me on how I can do this?
My code sample is shown below:
search_indexes.py
class PostIndex(indexes.RealTimeSearchIndex, indexes.Indexable):
    """Haystack search index for ``Post`` objects."""

    text = indexes.CharField(document=True, use_template=True)
    title = indexes.CharField(model_attr='title')
    created = indexes.DateTimeField(model_attr='created')

    def get_model(self):
        """Return the model class this index covers."""
        return Post

    def index_queryset(self):
        """Return the posts whose ``created`` timestamp is not in the future."""
        return self.get_model().objects.filter(created__lte=datetime.datetime.now())
Can anyone tell me the exact file where i'll have to make changes and what changes? Do i have to make changes to query.py file in haystack? Query.py has
def order_by(self, *args):
    """Alters the order in which the results should appear.

    Each positional argument is passed, in order, to ``add_order_by`` on the
    query of a clone obtained from ``self._clone()``; the clone is returned.
    """
    clone = self._clone()
    for field in args:
        clone.query.add_order_by(field)
    return clone
How can i make changes to this to show the latest on the top?
You can set order_by in your haystack_urls.py (or whatever you called it) e.g
qs = SearchQuerySet().order_by('-created')
urlpatterns = patterns('haystack.views',
url(r'^$', SearchView(searchqueryset=qs), name='haystack_search'),
)

Django haystack search returning items that are excluded

I have been having some issues with django-haystack and need some help.
I run a site that indexes projects and certain projects are in a status where they should not be seen, ie status='DE', status='PR'
my current setup is.
from haystack.indexes import *
from haystack import site
from models import Project
class ProjectIndex(RealTimeSearchIndex):
    """Haystack search index for ``Project``, limited to status ``'AP'``.

    Every queryset hook below returns the same restricted queryset so that
    projects in status ``'PR'`` or ``'DE'`` are never indexed or loaded.
    """

    project_name = CharField(document=True, use_template=True)
    description = CharField(use_template=True, model_attr='description')
    location = CharField(use_template=True, model_attr='location')
    owner = CharField(model_attr='owner')

    def _visible_projects(self):
        """Return the queryset of projects allowed to appear in search."""
        # filter(status='AP') already rules out 'PR' and 'DE'; the explicit
        # excludes are kept exactly as originally written.
        return Project.objects.filter(status='AP').exclude(status='PR').exclude(status='DE')

    def search(self):
        """Return the projects allowed to appear in search."""
        return self._visible_projects()

    def index_queryset(self):
        """Used when the entire index for model is updated."""
        return self._visible_projects()

    def get_queryset(self):
        """Return the default queryset for this index (restricted to visible projects)."""
        return self._visible_projects()

    def read_queryset(self):
        """Return the queryset haystack reads result objects from."""
        return self._visible_projects()
site.register(Project, ProjectIndex)
I managed to solve this issue by updating from 1.1 to 1.2
then all of the sudden I started receiving these Caught VariableDoesNotExist while rendering: Failed lookup for key [object] in u'None'
Googled it and found out that certain items might have disappeared out of the system and there is a handy command for that.
now I have a cronjob that does the following /usr/bin/python2.6 /www/mysite/manage.py update_index --remove every few hours

Django: How to make a query for on object based on an M2M field (multiple selections for field on search form)

I need help coming up with an efficient way to do a search query for a set of objects, based on a M2M field. My search form is going to look something like Blue Cross Blue Shield's | eg: this image
Now, let's suppose my model looks like this:
# models.py
class Provider(models.Model):
    """A provider that can be searched by title and by the services it offers."""

    title = models.CharField(max_length=150)
    phone = PhoneNumberField()
    # Lazy string reference: ServiceType is declared further down in this
    # module, so naming the class object here would raise NameError.
    services_offered = models.ManyToManyField('ServiceType')

    def __unicode__(self):
        return self.title
class ServiceCategory(models.Model):
    """A named category that groups service types."""

    service_category = models.CharField(max_length=30)

    def __unicode__(self):
        return self.service_category

    class Meta(object):
        verbose_name_plural = "Service Categories"
class ServiceType(models.Model):
    """A kind of service, belonging to one ``ServiceCategory``."""

    service_type = models.CharField(max_length=30)
    service_category = models.ForeignKey(ServiceCategory)

    def __unicode__(self):
        # Rendered as "<category> | <type>".
        return u'%s | %s' % (self.service_category, self.service_type)
Also, we have to keep in mind that the options that we select are subject to change, since how they display on the form is dynamic (new ServiceCategories and ServiceTypes can be added at anytime). *How should I go about constructing a query for the Provider objects, given that a person using the search form can select multiple Services_Offered?*
This is currently my HIGHLY INEFFICIENT METHOD:
#managers.py
from health.providers.models import *
from django.db.models import Q
class Query:
    """Bundles the querysets and managers used by the provider search."""

    def __init__(self):
        self.provider_objects = Provider.objects.all()
        self.provider_object = Provider.objects
        self.service_object = ServiceType.objects
        self.category_objects = ServiceCategory.objects.all()

    def simple_search_Q(self, **kwargs):  # matt's learning note: **kwargs passes any dictionary
        """Return distinct providers matching the search, ordered by title.

        Expects the keys ``'service'``, ``'title'`` and ``'state'`` in
        ``kwargs``; all three conditions must hold.
        """
        return self.provider_objects.filter(
            Q(services_offered__service_type__icontains=kwargs['service']),
            Q(title__icontains=kwargs['title']),
            Q(state=kwargs['state']),
        ).distinct().order_by('title')
====================
#views.py
from django.shortcuts import render_to_response
from health.providers.models import *
from health.search.forms import *
from health.search.managers import Query #location of the query sets
from django.core.paginator import Paginator, InvalidPage, EmptyPage
from django.template import RequestContext
def simple_search(request):
    """Render the search form, or run the search on a valid POST.

    A valid POST returns the paginated provider results. A GET, or a POST
    whose form is invalid, renders the index template with the form.
    """
    if request.method == 'POST':
        form = SimpleSearch(request.POST)
        if form.is_valid():
            request.session["provider_list"] = None
            kwargs = {
                'title': request.POST['title'],
                'service': request.POST['service'],
                'state': request.POST['state'],
            }
            provider_list = Query().simple_search_Q(**kwargs)
            return pagination_results(request, provider_list)
    else:
        form = SimpleSearch()
    # Reached on GET and on an invalid POST (the bound form is re-rendered).
    return render_to_response(
        '../templates/index.html',
        {'SimpleSearch_form': form},
        context_instance=RequestContext(request),
    )
How can I make my query:
Obtain Provider objects based on selecting multiple request.POST['service']
More efficient
Thanks in advance for any help.
Best Regards,
Matt
1: for multiple request.POST['service'], I assume you mean these are CheckBoxes.
I'd make the CheckBox values ID's, not names, and do a PK lookup.
'services_offered__pk__in': request.POST.getlist('service')
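That would return all Provider objects that have ANY of the selected services (an `__in` lookup matches a provider as soon as one of its services is in the list). Add .distinct() so that a provider matching several selected services is not returned more than once.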
PS: You are also using CapitalCase for instances which is very confusing. If you want your code to be readable, I highly recommend some changes to your style (don't use CapitalCase for instances or variables) and make your variables more descriptive.
SimpleSearch_form = SimpleSearch() # what is SimpleSearch?
simplesearch_form = SimpleSearchForm() # now, it's very clear what the class SimpleSearchForm is
# and the form instance is clearly a form instance.
2: Making it more efficient? You could get rid of a lot of code and code separation by removing your whole Query class. Also, I don't know why you are using Q objects, since you are not doing anything that would require them (like OR, or OR + AND).
def simple_search(request):
    """Render the search form, or run the provider search on a valid POST.

    A valid POST returns the paginated providers that offer at least one of
    the selected services and match the given title and state. A GET, or a
    POST whose form is invalid, renders the index template with the form.
    """
    if request.method == 'POST':
        searchform = SimpleSearchForm(request.POST)
        if searchform.is_valid():
            request.session['provider_list'] = None
            post = request.POST
            providers = Provider.objects.filter(
                # 'service' is the checkbox field name posted by the form.
                services_offered__pk__in=post.getlist('service'),
                title=post['title'],
                state=post['state'],
            ).distinct()  # the many-to-many join can repeat a provider
            return pagination_results(request, providers)
    else:
        searchform = SimpleSearchForm()
    return direct_to_template(request, '../templates/index.html', {'searchform': searchform})