Search for a mix of numbers and chars with haystack (elasticsearch) - django

I am using Django Haystack with Elasticsearch. I have a string field called 'code' in this type of format:
76-010
I would like to be able to search
76-
And get as a result
76-111
76-110
76-210
...
and so on.
but I don't want to get these results:
11-760
11-076
...
I already have a custom elastic search backend but I am not sure how should i indexing it to get the desired behavior.
class ConfigurableElasticBackend(ElasticsearchSearchBackend):
def __init__(self, connection_alias, **connection_options):
# see http://stackoverflow.com/questions/13636419/elasticsearch-edgengrams-and-numbers
self.DEFAULT_SETTINGS['settings']['analysis']['analyzer']['edgengram_analyzer']['tokenizer'] = 'standard'
self.DEFAULT_SETTINGS['settings']['analysis']['analyzer']['edgengram_analyzer']['filter'].append('lowercase')
super(ConfigurableElasticBackend, self).__init__(connection_alias, **connection_options)

The idea is to use an edgeNGram tokenizer in order to index every prefix of your code field. For instance, we would like 76-111 to be indexed as 7, 76, 76-, 76-1, 76-11 and 76-111. That way you will find 766-11 by searching for any of its prefixes.
Note that this article provides a full-fledge solution to your problem. The index settings for your case would look like this in Django code. You can then follow that article to wrap it up, but this should get you started.
class ConfigurableElasticBackend(ElasticsearchSearchBackend):
DEFAULT_SETTINGS = {
"settings": {
"analysis": {
"analyzer": {
"edgengram_analyzer": {
"tokenizer": "edgengram_tokenizer",
"filter": [ "lowercase" ]
}
},
"tokenizer": {
"edgengram_tokenizer": {
"type": "edgeNGram",
"min_gram": "1",
"max_gram": "25"
}
}
}
},
"mappings": {
"your_type": {
"properties": {
"code": {
"type": "string",
"analyzer": "edgengram_analyzer"
}
}
}
}
}
def __init__(self, connection_alias, **connection_options):
super(ConfigurableElasticBackend, self).__init__(connection_alias, **connection_options)
self.conn = pyelasticsearch.ElasticSearch(connection_options['URL'], timeout=self.timeout)
self.index_name = connection_options['INDEX_NAME']
# create the index with the above settings
self.conn.create_index(self.index_name, self.DEFAULT_SETTINGS)

Related

How can I get distinct values for the area.names using graphene?

my resolver in schema.py looks like this
def resolve_areas(self, info, **kwargs):
result = []
dupfree = []
user = info.context.user
areas = BoxModel.objects.filter(client=user, active=True).values_list('area_string', flat=True)
In GraphiQL I am using this query:
{
areas {
edges {
node {
id
name
}
}
}
}
And get Output that starts like this:
{
"data": {
"areas": {
"edges": [
{
"node": {
"id": "QXJlYTpkZWZ",
"name": "default"
}
},
{
"node": {
"id": "QXJlYTptZXN",
"name": "messe"
}
},
{
"node": {
"id": "QXJlYTptZXN",
"name": "messe"
}
},
But i want distinct values on the name variable
(Using a MySQL Database so distinct does not work)
SOLVED:
distinct was not working. so i just wrote a short loop which tracked onlye the string names duplicates in a list and only appended the whole "area" object if it's name has not been added to the duplicates list yet
result = []
dupl_counter = []
for area in areas:
if area not in dupl_counter:
dupl_counter.append(area)
result.append(Area(name=area))
print(area)

Flatten json return by DRF

I have json API returned as below format.
But I want to return json API decomposing namingzone key as specified below.
Could anyone tell me how I can revise serializer to achieve this?
serializer.py is also specified below.
For models.py and views.py, please refer to my previous post.
current
{
"zone": {
"zone": "office_enclosed",
"namingzone": [
{
"naming": "moffice"
}
]
},
"lpd": 11.9,
"sensor": true
},
{
"zone": {
"zone": "office_open",
"namingzone": [
{
"naming": "off"
},
{
"naming": "office"
}
]
},
"lpd": 10.5,
"sensor": true
}
Target
{
"zone": "office_enclosed",
"naming": "moffice",
"lpd": 11.9,
"sensor": true
},
{
"zone": "office_open",
"naming": "off",
"lpd": 10.5,
"sensor": true
},
{
"zone": "office_open",
"naming": "office",
"lpd": 10.5,
"sensor": true
}
serializer.py
class namingNewSerializer(serializers.ModelSerializer):
class Meta:
model=Naming
fields=('naming',)
class zoneSerializer(serializers.ModelSerializer):
namingzone=namingNewSerializer(many=True)
class Meta:
model=Zone
fields = ('zone','namingzone')
class lightSerializer(serializers.ModelSerializer):
zone = zoneSerializer()
class Meta:
model=Light
fields = ('zone','lpd','sensor')
class namingSerializer(serializers.ModelSerializer):
zone=zoneSerializer()
class Meta:
model=Naming
fields=('zone','naming')
I would say using Serializer might complicate the implementations. Rather, you can take an pythonic approach. Try like this:
class SomeView(APIView):
...
def get(self, request, *args, **kwargs):
data = lightSerializer(Light.objects.all(), many=True).data
data = list(data) # convert lazy object to list
updated_data = list()
for item in data:
newdict = dict()
zone = item['zone']
newdict.update({'zone':zone['zone'], 'lpd': item['lpd'], 'sensor':item['sensor']})
for naming_zone in zone.get('namingzone'):
naming_zone.update(newDict)
updated_data.append(naming_zone)
return Response(updated_data, status=status.HTTP_200_OK)
See DRF Field document about source. It will help you.
https://www.django-rest-framework.org/api-guide/fields/#source

How can i get Alexa Slot Value in ASK-SDK lambda function?

I want to access the slot value '{cityName}' in my Lambda Function. I am using ASK-SDK. What is the python code or syntax to do so?
If the WeatherApiCallIntent is triggered with a CityName slot value, the request JSON will look like this:
"request": {
"type": "IntentRequest",
"requestId": "amzn1.echo-api.request.xxxxx-xxx-xxx-xx-xxxxxxx",
"timestamp": "2018-09-12T13:35:25Z",
"locale": "en-US",
"intent": {
"name": "WeatherApiCallIntent",
"confirmationStatus": "NONE",
"slots": {
"CityName ": {
"name": "CityName",
"value": "Kochi",
...
In you WeatherApiCallIntent handler you can use
print("Slot: " + intent_request['intent']['slots']['CityName']['value'])
The above method is correct but it did not work for me.
Another answer to the above question is as follows:
item, is_resolved = util.get_intent_(slots=handler_input.request_envelope.request.intent.slots)
and then define function get_intent_ as follows:
import random
import six
from ask_sdk_core.handler_input import HandlerInput
from ask_sdk_core.utils import is_request_type
def get_intent_(slots):
item = []
resolved_item = None
for _,slot in six.iteritems(slots):
if slot.value is not None:
resolved_item = slot.value
if resolved_item is not None:
return resolved_item, True
else:
return resolved_item, False
This method looks complicated but it is a good practice to define a function and call it.

Elasticsearch in Django - sort alphabetically

I have a following doc:
#brand.doc_type
class BrandDocument(DocType):
class Meta:
model = Brand
id = IntegerField()
name = StringField(
fields={
'raw': {
'type': 'keyword',
'fielddata': True,
}
},
)
lookup_name = StringField(
fields={
'raw': {
'type': 'string',
}
},
)
and I try to make a lookup using this:
BrandDocument.search().sort({
'name.keyword': order,
})
The problem is that I'm getting results sorted in a case sensitive way, which means that instead of 'a', 'A', 'ab', 'AB' I get 'A', 'AB', 'a', 'ab'. How can this be fixed?
EDIT After some additional search I've come up with something like this:
lowercase_normalizer = normalizer(
'lowercase_normalizer',
filter=['lowercase']
)
lowercase_analyzer = analyzer(
'lowercase_analyzer',
tokenizer="keyword",
filter=['lowercase'],
)
#brand.doc_type
class BrandDocument(DocType):
class Meta:
model = Brand
id = IntegerField()
name = StringField(
analyzer=lowercase_analyzer,
fields={
'raw': Keyword(normalizer=lowercase_normalizer, fielddata=True),
},
)
The issue persists, however, and I can't find in the docs how this normalizer should be used.
I would suggest to create a custom analyzer with lowercase filter and apply it to the field while indexing.
So you have to update the following in the index settings:
{
"index": {
"analysis": {
"analyzer": {
"custom_sort": {
"tokenizer": "keyword",
"filter": [
"lowercase"
]
}
}
}
}
}
Add a field (based on which you need to sort) in mapping with the custom_sort analyzer as below:
{
"properties":{
"sortField":{
"type":"text",
"analyzer":"custom_sort"
}
}
}
If the field already exists in mapping then you can add a sub fields to the existing field with the analyzer as below.
Assuming the field name having type as keyword already exists, update it as:
{
"properties":{
"name":{
"type": "keyword",
"fields":{
"sortval":{
"type":"text",
"analyzer":"custom_sort"
}
}
}
}
}
Once done you need to reindex your data so that lowercase values are indexed. Then you can use the field to sort as:
Case 1 (new field):
"sort": [
{
"sortField": "desc"
}
]
Case 2 (sub field):
"sort": [
{
"name.sortval": "desc"
}
]

Django Haystack - How to force exact attribute match without stemming?

I'm using Django 1.5 with django-haystack 2.0 and an elasticsearch backend. I'm trying to search by an exact attribute match. However, I'm getting "similar" results even though I'm using both the __exact operator and the Exact() class. How can I prevent this behavior?
For example:
# models.py
class Person(models.Model):
name = models.TextField()
# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.CharField(document=True, use_template=True)
name = indexes.CharField(model_attr="name")
def get_model(self):
return Person
def index_queryset(self, using=None):
return self.get_model().objects.all()
# templates/search/indexes/people/person_text.txt
{{ object.name }}
>>> p1 = Person(name="Simon")
>>> p1.save()
>>> p2 = Person(name="Simons")
>>> p2.save()
$ ./manage.py rebuild_index
>>> person_sqs = SearchQuerySet().models(Person)
>>> person_sqs.filter(name__exact="Simons")
[<SearchResult: people.person (name=u'Simon')>
<SearchResult: people.person (name=u'Simons')>]
>>> person_sqs.filter(name=Exact("Simons", clean=True))
[<SearchResult: people.person (name=u'Simon')>
<SearchResult: people.person (name=u'Simons')>]
I only want the search result for "Simons" - the "Simon" result should not show up.
Python3, Django 1.10, Elasticsearch 2.4.4.
TL;DR: define custom tokenizer (not filter)
Verbose explanation
a) use EdgeNgramField:
# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.EdgeNgramField(document=True, use_template=True)
...
b) template:
# templates/search/indexes/people/person_text.txt
{{ object.name }}
c) create custom search backend:
# backends.py
from django.conf import settings
from haystack.backends.elasticsearch_backend import (
ElasticsearchSearchBackend,
ElasticsearchSearchEngine,
)
class CustomElasticsearchSearchBackend(ElasticsearchSearchBackend):
def __init__(self, connection_alias, **connection_options):
super(CustomElasticsearchSearchBackend, self).__init__(
connection_alias, **connection_options)
setattr(self, 'DEFAULT_SETTINGS', settings.ELASTICSEARCH_INDEX_SETTINGS)
class CustomElasticsearchSearchEngine(ElasticsearchSearchEngine):
backend = CustomElasticsearchSearchBackend
d) define custom tokenizer (not filter!):
# settings.py
HAYSTACK_CONNECTIONS = {
'default': {
'ENGINE': 'apps.persons.backends.CustomElasticsearchSearchEngine',
'URL': 'http://127.0.0.1:9200/',
'INDEX_NAME': 'haystack',
},
}
ELASTICSEARCH_INDEX_SETTINGS = {
"settings": {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"type": "custom",
"tokenizer": "custom_ngram_tokenizer",
"filter": ["asciifolding", "lowercase"]
},
"edgengram_analyzer": {
"type": "custom",
"tokenizer": "custom_edgengram_tokenizer",
"filter": ["asciifolding", "lowercase"]
}
},
"tokenizer": {
"custom_ngram_tokenizer": {
"type": "nGram",
"min_gram": 3,
"max_gram": 12,
"token_chars": ["letter", "digit"]
},
"custom_edgengram_tokenizer": {
"type": "edgeNGram",
"min_gram": 2,
"max_gram": 12,
"token_chars": ["letter", "digit"]
}
}
}
}
}
HAYSTACK_DEFAULT_OPERATOR = 'AND'
e) use AutoQuery (more versatile):
# views.py
search_value = 'Simons'
...
person_sqs = \
SearchQuerySet().models(Person).filter(
content=AutoQuery(search_value)
)
f) reindex after changes:
$ ./manage.py rebuild_index
I was facing a similar problem. if you change the settings of your haystacks elasticsearch back end like:
DEFAULT_SETTINGS = {
'settings': {
"analysis": {
"analyzer": {
"ngram_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["haystack_ngram", "lowercase"]
},
"edgengram_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["haystack_edgengram", "lowercase"]
}
},
"tokenizer": {
"haystack_ngram_tokenizer": {
"type": "nGram",
"min_gram": 6,
"max_gram": 15,
},
"haystack_edgengram_tokenizer": {
"type": "edgeNGram",
"min_gram": 6,
"max_gram": 15,
"side": "front"
}
},
"filter": {
"haystack_ngram": {
"type": "nGram",
"min_gram": 6,
"max_gram": 15
},
"haystack_edgengram": {
"type": "edgeNGram",
"min_gram": 6,
"max_gram": 15
}
}
}
}
}
Then it will tokenize only when the query is more than 6 character.
If you want results like "xyzsimonsxyz", then you would need to use ngram analyzer instead of EdgeNGram or you could use both depending on your requirements. EdgeNGram generates tokens only from the beginning.
with NGram 'simons' will be one of the generated tokens for term xyzsimonsxyz assuming max_gram >=6 and you will get expected results, also search_analyzer needs to be different or you will get weird results.
Also index size might get pretty big with ngram if you have huge chunk of text
Not use CharField use EdgeNgramField.
# search_indexes.py
class PersonIndex(indexes.SearchIndex, indexes.Indexable):
text = indexes.CharField(document=True, use_template=True)
name = indexes.EdgeNgramField(model_attr="name")
def get_model(self):
return Person
def index_queryset(self, using=None):
return self.get_model().objects.all()
And not user filter, user autocomplete
person_sqs = SearchQuerySet().models(Person)
person_sqs.autocomplete(name="Simons")
source: http://django-haystack.readthedocs.org/en/v2.0.0/autocomplete.html