Django Haystack - How to force exact attribute match without stemming? - django

I'm using Django 1.5 with django-haystack 2.0 and an elasticsearch backend. I'm trying to search by an exact attribute match. However, I'm getting "similar" results even though I'm using both the __exact operator and the Exact() class. How can I prevent this behavior?
For example:
# models.py
class Person(models.Model):
name = models.TextField()
# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.CharField(document=True, use_template=True)
name = indexes.CharField(model_attr="name")
def get_model(self):
return Person
def index_queryset(self, using=None):
return self.get_model().objects.all()
# templates/search/indexes/people/person_text.txt
{{ object.name }}
>>> p1 = Person(name="Simon")
>>> p1.save()
>>> p2 = Person(name="Simons")
>>> p2.save()
$ ./manage.py rebuild_index
>>> person_sqs = SearchQuerySet().models(Person)
>>> person_sqs.filter(name__exact="Simons")
[<SearchResult: people.person (name=u'Simon')>
<SearchResult: people.person (name=u'Simons')>]
>>> person_sqs.filter(name=Exact("Simons", clean=True))
[<SearchResult: people.person (name=u'Simon')>
<SearchResult: people.person (name=u'Simons')>]
I only want the search result for "Simons" - the "Simon" result should not show up.

Python3, Django 1.10, Elasticsearch 2.4.4.
TL;DR: define custom tokenizer (not filter)
Verbose explanation
a) use EdgeNgramField:
# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.EdgeNgramField(document=True, use_template=True)
...
b) template:
# templates/search/indexes/people/person_text.txt
{{ object.name }}
c) create custom search backend:
# backends.py
from django.conf import settings
from haystack.backends.elasticsearch_backend import (
ElasticsearchSearchBackend,
ElasticsearchSearchEngine,
)
class CustomElasticsearchSearchBackend(ElasticsearchSearchBackend):
def __init__(self, connection_alias, **connection_options):
super(CustomElasticsearchSearchBackend, self).__init__(
connection_alias, **connection_options)
setattr(self, 'DEFAULT_SETTINGS', settings.ELASTICSEARCH_INDEX_SETTINGS)
class CustomElasticsearchSearchEngine(ElasticsearchSearchEngine):
backend = CustomElasticsearchSearchBackend
d) define custom tokenizer (not filter!):
# settings.py
HAYSTACK_CONNECTIONS = {
'default': {
'ENGINE': 'apps.persons.backends.CustomElasticsearchSearchEngine',
'URL': 'http://127.0.0.1:9200/',
'INDEX_NAME': 'haystack',
},
}
ELASTICSEARCH_INDEX_SETTINGS = {
"settings": {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"type": "custom",
"tokenizer": "custom_ngram_tokenizer",
"filter": ["asciifolding", "lowercase"]
},
"edgengram_analyzer": {
"type": "custom",
"tokenizer": "custom_edgengram_tokenizer",
"filter": ["asciifolding", "lowercase"]
}
},
"tokenizer": {
"custom_ngram_tokenizer": {
"type": "nGram",
"min_gram": 3,
"max_gram": 12,
"token_chars": ["letter", "digit"]
},
"custom_edgengram_tokenizer": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 12,
"token_chars": ["letter", "digit"]
}
}
}
}
}
HAYSTACK_DEFAULT_OPERATOR = 'AND'
e) use AutoQuery (more versatile):
# views.py
search_value = 'Simons'
...
person_sqs = \
SearchQuerySet().models(Person).filter(
content=AutoQuery(search_value)
)
f) reindex after changes:
$ ./manage.py rebuild_index

I was facing a similar problem. if you change the settings of your haystacks elasticsearch back end like:
DEFAULT_SETTINGS = {
'settings': {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["haystack_ngram", "lowercase"]
},
"edgengram_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["haystack_edgengram", "lowercase"]
}
},
"tokenizer": {
"haystack_ngram_tokenizer": {
"type": "nGram",
"min_gram": 6,
"max_gram": 15,
},
"haystack_edgengram_tokenizer": {
"type": "edgeNGram",
"min_gram": 6,
"max_gram": 15,
"side": "front"
}
},
"filter": {
"haystack_ngram": {
"type": "nGram",
"min_gram": 6,
"max_gram": 15
},
"haystack_edgengram": {
"type": "edgeNGram",
"min_gram": 6,
"max_gram": 15
}
}
}
}
}
Then it will tokenize only when the query is more than 6 character.
If you want results like "xyzsimonsxyz", then you would need to use ngram analyzer instead of EdgeNGram or you could use both depending on your requirements. EdgeNGram generates tokens only from the beginning.
with NGram 'simons' will be one of the generated tokens for term xyzsimonsxyz assuming max_gram >=6 and you will get expected results, also search_analyzer needs to be different or you will get weird results.
Also index size might get pretty big with ngram if you have huge chunk of text

Not use CharField use EdgeNgramField.
# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.CharField(document=True, use_template=True)
name = indexes.EdgeNgramField(model_attr="name")
def get_model(self):
return Person
def index_queryset(self, using=None):
return self.get_model().objects.all()
And not user filter, user autocomplete
person_sqs = SearchQuerySet().models(Person)
person_sqs.autocomplete(name="Simons")
source: http://django-haystack.readthedocs.org/en/v2.0.0/autocomplete.html

Related

Django/Wagtail Rest Framework API Ordering/Pagination

I am using wagtail. I have serialized my API. I want to order them by -first_published_at, when someone hit my API url api/v2/pages they will see an ordered API without filtering it via URL. here is my api.py code:
class ProdPagesAPIViewSet(BaseAPIViewSet):
renderer_classes = [JSONRenderer]
filter_backends = [FieldsFilter,
ChildOfFilter,
AncestorOfFilter,
DescendantOfFilter,
OrderingFilter,
TranslationOfFilter,
LocaleFilter,
SearchFilter,]
meta_fields = ["type","seo_title","search_description","first_published_at"]
body_fields = ["id","type","seo_title","search_description","first_published_at","title"]
listing_default_fields = ["type","seo_title","search_description","first_published_at","id","title","alternative_title","news_slug","blog_image","video_thumbnail","categories","blog_authors","excerpt","content","content2","tags",]
nested_default_fields = []
ordered_queryset= []
name = "pages"
model = AddStory
api_router.register_endpoint("pages", ProdPagesAPIViewSet)
I have tried ordered_queryset= [AddStory.objects.order_by('-first_published_at')]
But it's not ordered by the newest published stories. How should I do the query?
Here is my API response
{
"meta": {
"total_count": 6
},
"items": [
{
"id": 4,
"meta": {
"type": "blog.AddStory",
"seo_title": "",
"search_description": "",
"first_published_at": "2022-08-30T11:05:11.341355Z"
},
{
"id": 6,
"meta": {
"type": "blog.AddStory",
"seo_title": "",
"search_description": "",
"first_published_at": "2022-08-30T11:13:47.114889Z"
},
{
"id": 7,
"meta": {
"type": "blog.AddStory",
"seo_title": "",
"search_description": "",
"first_published_at": "2022-08-31T11:13:47.114889Z"
},
Solved after using get_queryset instead of order_queryset
#api.py
class ProdPagesAPIViewSet(BaseAPIViewSet):
renderer_classes = [JSONRenderer]
filter_backends = [FieldsFilter,
ChildOfFilter,
AncestorOfFilter,
DescendantOfFilter,
OrderingFilter,
TranslationOfFilter,
LocaleFilter,
SearchFilter,]
meta_fields = ["type","seo_title","search_description","first_published_at"]
body_fields = ["id","type","seo_title","search_description","first_published_at","title"]
listing_default_fields = ["type","seo_title","search_description","first_published_at","id","title","alternative_title","news_slug","blog_image","video_thumbnail","categories","blog_authors","excerpt","content","content2","tags",]
nested_default_fields = []
def get_queryset(self):
return self.model.objects.all().order_by("-first_published_at")
name = "pages"
model = AddStory
api_router.register_endpoint("pages", ProdPagesAPIViewSet)

Flatten json return by DRF

I have json API returned as below format.
But I want to return json API decomposing namingzone key as specified below.
Could anyone tell me how I can revise serializer to achieve this?
serializer.py is also specified below.
For models.py and views.py, please refer to my previous post.
current
{
"zone": {
"zone": "office_enclosed",
"namingzone": [
{
"naming": "moffice"
}
]
},
"lpd": 11.9,
"sensor": true
},
{
"zone": {
"zone": "office_open",
"namingzone": [
{
"naming": "off"
},
{
"naming": "office"
}
]
},
"lpd": 10.5,
"sensor": true
}
Target
{
"zone": "office_enclosed",
"naming": "moffice",
"lpd": 11.9,
"sensor": true
},
{
"zone": "office_open",
"naming": "off",
"lpd": 10.5,
"sensor": true
},
{
"zone": "office_open",
"naming": "office",
"lpd": 10.5,
"sensor": true
}
serializer.py
class namingNewSerializer(serializers.ModelSerializer):
class Meta:
model=Naming
fields=('naming',)
class zoneSerializer(serializers.ModelSerializer):
namingzone=namingNewSerializer(many=True)
class Meta:
model=Zone
fields = ('zone','namingzone')
class lightSerializer(serializers.ModelSerializer):
zone = zoneSerializer()
class Meta:
model=Light
fields = ('zone','lpd','sensor')
class namingSerializer(serializers.ModelSerializer):
zone=zoneSerializer()
class Meta:
model=Naming
fields=('zone','naming')
I would say using Serializer might complicate the implementations. Rather, you can take an pythonic approach. Try like this:
class SomeView(APIView):
...
def get(self, request, *args, **kwargs):
data = lightSerializer(Light.objects.all(), many=True).data
data = list(data) # convert lazy object to list
updated_data = list()
for item in data:
newdict = dict()
zone = item['zone']
newdict.update({'zone':zone['zone'], 'lpd': item['lpd'], 'sensor':item['sensor']})
for naming_zone in zone.get('namingzone'):
naming_zone.update(newDict)
updated_data.append(naming_zone)
return Response(updated_data, status=status.HTTP_200_OK)
See DRF Field document about source. It will help you.
https://www.django-rest-framework.org/api-guide/fields/#source

Elasticsearch in Django - sort alphabetically

I have a following doc:
#brand.doc_type
class BrandDocument(DocType):
class Meta:
model = Brand
id = IntegerField()
name = StringField(
fields={
'raw': {
'type': 'keyword',
'fielddata': True,
}
},
)
lookup_name = StringField(
fields={
'raw': {
'type': 'string',
}
},
)
and I try to make a lookup using this:
BrandDocument.search().sort({
'name.keyword': order,
})
The problem is that I'm getting results sorted in a case sensitive way, which means that instead of 'a', 'A', 'ab', 'AB' I get 'A', 'AB', 'a', 'ab'. How can this be fixed?
EDIT After some additional search I've come up with something like this:
lowercase_normalizer = normalizer(
'lowercase_normalizer',
filter=['lowercase']
)
lowercase_analyzer = analyzer(
'lowercase_analyzer',
tokenizer="keyword",
filter=['lowercase'],
)
#brand.doc_type
class BrandDocument(DocType):
class Meta:
model = Brand
id = IntegerField()
name = StringField(
analyzer=lowercase_analyzer,
fields={
'raw': Keyword(normalizer=lowercase_normalizer, fielddata=True),
},
)
The issue persists, however, and I can't find in the docs how this normalizer should be used.
I would suggest to create a custom analyzer with lowercase filter and apply it to the field while indexing.
So you have to update the following in the index settings:
{
"index": {
"analysis": {
"analyzer": {
"custom_sort": {
"tokenizer": "keyword",
"filter": [
"lowercase"
]
}
}
}
}
}
Add a field (based on which you need to sort) in mapping with the custom_sort analyzer as below:
{
"properties":{
"sortField":{
"type":"text",
"analyzer":"custom_sort"
}
}
}
If the field already exists in mapping then you can add a sub fields to the existing field with the analyzer as below.
Assuming the field name having type as keyword already exists, update it as:
{
"properties":{
"name":{
"type": "keyword",
"fields":{
"sortval":{
"type":"text",
"analyzer":"custom_sort"
}
}
}
}
}
Once done you need to reindex your data so that lowercase values are indexed. Then you can use the field to sort as:
Case 1 (new field):
"sort": [
{
"sortField": "desc"
}
]
Case 2 (sub field):
"sort": [
{
"name.sortval": "desc"
}
]

Django - Multiple data values in fusion pie charts

I want to add multiple data values in my pie charts taking model fields. currently i am doing this:
def detail(request, pk):
if not request.user.is_authenticated():
return render(request, 'registration/login.html')
else:
bowler = Bowlers.objects.get(pk=pk)
select_list = UserSelect.objects.filter(user=request.user, bowler=pk)
all_select = UserSelect.objects.filter(user=request.user).count()
team = Team.objects.filter(user=request.user)
dataSource = {}
dataSource['chart'] = {
"caption": "last year",
"subCaption": "Harry's SuperMart",
"xAxisName": "Month",
"yAxisName": "Revenues (In USD)",
"numberPrefix": "$",
"theme": "zune",
"type": "doughnut2d"
}
dataSource['data'] = []
# Iterate through the data in `Revenue` model and insert in to the `dataSource['data']` list.
data = {}
data['label'] = bowler.name
data['value'] = bowler.ave
dataSource['data'].append(data)
column2D = FusionCharts("doughnut2d", "ex1", "600", "350", "chart-1", "json", dataSource)
context = {
'bowler': bowler,
'select_list': select_list,
'all_select': all_select,
'team': team,
'output': column2D.render()
}
return render(request, 'bowler_detail.html', context)
and i want to add more values something like this: (this is JSON format of adding multiple data values)
"data": [
{
"label": "Venezuela",
"value": "290"
},
{
"label": "Saudi",
"value": "260"
},
]
Any guide how will I add more values fetching from django models in my chart?

Search for a mix of numbers and chars with haystack (elasticsearch)

I am using Django Haystack with Elasticsearch. I have a string field called 'code' in this type of format:
76-010
I would like to be able to search
76-
And get as a result
76-111
76-110
76-210
...
and so on.
but I don't want to get these results:
11-760
11-076
...
I already have a custom elastic search backend but I am not sure how should i indexing it to get the desired behavior.
class ConfigurableElasticBackend(ElasticsearchSearchBackend):
def __init__(self, connection_alias, **connection_options):
# see http://stackoverflow.com/questions/13636419/elasticsearch-edgengrams-and-numbers
self.DEFAULT_SETTINGS['settings']['analysis']['analyzer']['edgengram_analyzer']['tokenizer'] = 'standard'
self.DEFAULT_SETTINGS['settings']['analysis']['analyzer']['edgengram_analyzer']['filter'].append('lowercase')
super(ConfigurableElasticBackend, self).__init__(connection_alias, **connection_options)
The idea is to use an edgeNGram tokenizer in order to index every prefix of your code field. For instance, we would like 76-111 to be indexed as 7, 76, 76-, 76-1, 76-11 and 76-111. That way you will find 766-11 by searching for any of its prefixes.
Note that this article provides a full-fledge solution to your problem. The index settings for your case would look like this in Django code. You can then follow that article to wrap it up, but this should get you started.
class ConfigurableElasticBackend(ElasticsearchSearchBackend):
DEFAULT_SETTINGS = {
"settings": {
"analysis": {
"analyzer": {
"edgengram_analyzer": {
"tokenizer": "edgengram_tokenizer",
"filter": [ "lowercase" ]
}
},
"tokenizer": {
"edgengram_tokenizer": {
"type": "edgeNGram",
"min_gram": "1",
"max_gram": "25"
}
}
}
},
"mappings": {
"your_type": {
"properties": {
"code": {
"type": "string",
"analyzer": "edgengram_analyzer"
}
}
}
}
}
def __init__(self, connection_alias, **connection_options):
super(ConfigurableElasticBackend, self).__init__(connection_alias, **connection_options)
self.conn = pyelasticsearch.ElasticSearch(connection_options['URL'], timeout=self.timeout)
self.index_name = connection_options['INDEX_NAME']
# create the index with the above settings
self.conn.create_index(self.index_name, self.DEFAULT_SETTINGS)