I am trying to integrate a Whoosh searcher into a Django project. I saw that you can do that using Haystack, but I don't yet know how to add my custom Whoosh index to the searcher. My schema uses ID, KEYWORD and TEXT fields, although in reality they all hold text. I chose these field types because they suit my search needs for each of the fields. How do I use this schema in Haystack?
PS: A solution without Haystack is ok too.
Here is my whoosh schema/writer/searcher
import pandas as pd
from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser
from whoosh.query import *
def nan2none(x):
    """Return ``None`` when ``pd.isna(x)`` is true, otherwise return ``x`` unchanged."""
    if pd.isna(x):
        return None
    return x
# --- Build the index: one Whoosh document per row of df.csv -----------------
df = pd.read_csv("df.csv", index_col=[0])
schema = Schema(
    a=ID(stored=True),
    b=KEYWORD(lowercase=True),
    c=TEXT,
    d=KEYWORD(lowercase=True),
)
ix = create_in("indexdir", schema)
writer = ix.writer()
for index, row in df.iterrows():
    # nan2none turns missing cells into None before they reach the writer.
    writer.add_document(
        a=index,
        b=nan2none(row['b']),
        c=nan2none(row['c']),
        d=nan2none(row['d']),
    )
writer.commit()

# --- Search every field for the term and OR the per-field queries -----------
search_term = "hobbit"
with ix.searcher() as searcher:
    a_query = QueryParser("a", ix.schema).parse(search_term)
    b_query = QueryParser("b", ix.schema).parse(search_term)
    # Fixed: this parser previously targeted field "b", so field "c" was
    # never searched and "b" was searched twice.
    c_query = QueryParser("c", ix.schema).parse(search_term)
    d_var_query = QueryParser("d", ix.schema, termclass=Variations).parse(search_term)
    d_fuzz_query = QueryParser("d", ix.schema, termclass=FuzzyTerm).parse(search_term)
    query = Or([a_query, b_query, c_query, d_var_query, d_fuzz_query])
    results = searcher.search(query, limit=None)
    print(results)
    for res in results:
        print(res)
But in my django model all the documents I am adding above are CharField as follows:
class ModelLetters(models.Model):
    """Django model with four character columns, ``a`` through ``d``."""

    # Fixed: the base class was spelled ``modes.model``; the field
    # declarations below already use the ``models`` module.
    a = models.CharField(max_length=50)
    b = models.CharField(max_length=100)
    c = models.CharField(max_length=100)
    d = models.CharField(max_length=250)
Whereas my haystack index is as follows (all CharField too):
from haystack import indexes
from appmanager.model.model_letters import ModelLetters
class LettersIndex(indexes.SearchIndex, indexes.Indexable):
    """Haystack search index declared over ``ModelLetters``."""

    # Document field; ``use_template=True`` asks Haystack to render its content
    # from a template.
    text = indexes.CharField(document=True, use_template=True)

    # One index field per model attribute.
    a = indexes.CharField(model_attr="a")
    b = indexes.CharField(model_attr="b")
    c = indexes.CharField(model_attr="c")
    d = indexes.CharField(model_attr="d")

    class Meta:
        model = ModelLetters
        fields = ["a", "b", "c", "d"]

    def get_model(self):
        """Return the model class this index covers."""
        return ModelLetters

    def index_queryset(self, using=None):
        """Used when the entire index for model is updated."""
        model = self.get_model()
        return model.objects.all()
Related
Sorry if this is a really basic question; I have only just started learning to program.
i put this in my models.py
from django.db import models
from . import func
class Materials(models.Model):
    """A material type with its name, price, and last-update timestamp."""

    typeId = models.IntegerField()
    typeName = models.CharField(max_length=250)
    price = models.FloatField()
    # auto_now=True refreshes this timestamp on every save().
    updated = models.DateTimeField(auto_now=True)

    class Meta:
        # Fixed: ('-typeId') is just a parenthesised string; Django requires
        # ``ordering`` to be a tuple or list, hence the trailing comma.
        ordering = ('-typeId',)

    def __str__(self):
        # Fixed: the parameter was spelled ``Self`` while the body used
        # ``self`` (NameError), and __str__ must return a str, not an int.
        return str(self.typeId)
def insert_data_once():
    """Save one ``Materials`` row for every row returned by ``func.material_id()``.

    The price of each row is looked up with ``func.sell_min`` using the
    row's first column.
    """
    for row in func.material_id():
        type_id = row[0]
        record = Materials(
            typeId=type_id,
            typeName=row[1],
            price=func.sell_min(type_id),
        )
        record.save()


# NOTE(review): this call runs every time the module is imported. A
# management command or data migration is presumably a better home for it;
# confirm before moving.
insert_data_once()
here is func.py
import requests
from xml.etree import ElementTree
import sqlite3
def material_id():
    """Return the distinct published material types stored in ``eve.db``.

    Returns:
        A list of ``(materialTypeID, typeName)`` tuples.
    """
    command = 'SELECT DISTINCT invTypeMaterials.materialTypeID, invTypes.typeName FROM invTypeMaterials ' \
              'INNER JOIN invTypes ON invTypeMaterials.materialTypeID = invTypes.typeID ' \
              'WHERE invTypes.Published = 1'
    conn = sqlite3.connect('eve.db')
    try:
        return conn.execute(command).fetchall()
    finally:
        # Fixed: the connection used to be left open after every call.
        conn.close()
def sell_min(type_id):
    """Fetch the ``min`` price for ``type_id`` from the evemarketer marketstat API.

    Args:
        type_id: Item type id; it is converted with ``str()`` for the query string.

    Returns:
        The text of the last ``<min>`` element under ``root[0][0][1]``, as a float.

    Raises:
        ValueError: If the response contains no ``<min>`` element at that position.
    """
    # Fixed: the query string contained "®ionlimit", i.e. "&regionlimit" after
    # "&reg" had been decoded as an HTML entity.
    URL = ('https://api.evemarketer.com/ec/marketstat?typeid=' + str(type_id)
           + '&regionlimit=10000002&usesystem=30000142')
    # A timeout keeps one unresponsive request from blocking the caller forever.
    minerals_price = requests.get(URL, timeout=10)
    root = ElementTree.fromstring(minerals_price.content)
    prices = [child.text for child in root[0][0][1].iter('min')]
    if not prices:
        # Previously this path failed with UnboundLocalError.
        raise ValueError('no <min> element in marketstat response for type %s' % type_id)
    # The original loop kept the last match; preserve that.
    return float(prices[-1])
Where should I run the insert_data_once function? When it is in models.py, the function keeps looping and I can't run manage.py runserver.
thank you
so I have this model set up with django and mongoengine.
class Product(Document):
    """A product together with the sources it was found in."""

    product_id = IntField()
    title = StringField(max_length=255)
    # Fixed: ``Source`` is defined below this class, so naming it directly
    # raised NameError at import time. The string form defers the lookup.
    sources = ListField(ReferenceField('Source', dbref=True))


class Source(Document):
    """Base document for a product source; subclassable via ``allow_inheritance``."""

    source_id = IntField()
    source_type = StringField(choices=settings.PARENT_TYPE_CHOICES, max_length=50)
    name = StringField(max_length=255)
    url = URLField(max_length=2000)
    meta = {"allow_inheritance": True}
And in my scrapy pipeline I save the following data:
class SaveItemPipeline(object):
    """Scrapy item pipeline that stores a scraped product and its sources."""

    def process_item(self, item, spider):
        """Persist ``item["product"]`` and return the item.

        The product's ``"sources"`` entry is replaced in place by the list
        returned from ``create_sources`` before the product is looked up or
        created.
        """
        product = item["product"]
        product["sources"] = self.create_sources(product)
        Product.objects.get_or_create(**product)
        return item

    def create_sources(self, product):
        """Get or create one source document per entry of ``product["sources"]``.

        Returns:
            The list of ids of the source documents, in input order.

        Raises:
            ValueError: If an entry has a ``source_type`` other than
                "user", "store" or "collection".
        """
        source_classes = {
            "user": UserSource,
            "store": StoreSource,
            "collection": CollectionSource,
        }
        temp_sources = []
        for source in product["sources"]:
            print(source)
            source_type = source["source_type"]
            if source_type not in source_classes:
                # Fixed: an unknown type used to fall through the if/elif
                # chain and silently re-append the previous source's id
                # (or hit NameError on the first entry).
                raise ValueError("unknown source_type: %r" % (source_type,))
            temp_source, _created = source_classes[source_type].objects.get_or_create(**source)
            # NOTE(review): this collects ids; the ReferenceField may expect
            # the documents themselves -- confirm against mongoengine.
            temp_sources.append(temp_source.id)
        return temp_sources
However, when I run the scraper, it gives me this error on save:
raise ValidationError(message, errors=errors, field_name=field_name)
mongoengine.errors.ValidationError:
[ObjectId('55787a07516ddcf4d93cd4c6'),
ObjectId('55787b07516ddcf5aff06fa9'),
ObjectId('55787b07516ddcf5aff06faa')] is not a valid ObjectId
By the way, UserSource, StoreSource, etc. all inherit from Source, so they are just subclasses. Am I doing anything wrong here? I don't understand why it gives me that error when the product gets created.
Thanks!
You can use this
class Source(Document):
    """Minimal source document for the example."""

    source_id = IntField()


class Product(Document):
    """Minimal product document holding a list of references to ``Source``."""

    sources = ListField(ReferenceField(Source, dbref=True))


# Fixed: ``objects.create()`` returns the saved document itself, not a
# ``(document, created)`` pair, so its result must not be tuple-unpacked.
src = Source.objects.create(source_id=1)
# Pass the Source document itself, not its id.
pd = Product.objects.create(sources=[src])
It works for me. I am using mongoengine 0.8.7, pymongo 2.8
I'm trying to use Django's ModelForm and inline forms in my templates. However, I cannot find any documentation that maps neatly to a database model with multiple foreign keys back to the same table. These are my models:
# models.py
class Universities(models.Model):
    """A university, identified by name."""

    name = models.CharField(max_length=100)


class Majors(models.Model):
    """A field of study, identified by name."""

    name = models.CharField(max_length=80)


class Resumes(models.Model):
    """A resume that points at an undergraduate and a graduate university and major."""

    name = models.CharField(max_length=70)
    # on_delete is spelled out: it is a required argument from Django 2.0
    # onwards, and CASCADE matches the implicit default of older releases.
    undergrad = models.ForeignKey(
        Universities, related_name='undergrad_university', on_delete=models.CASCADE)
    undergrad_major = models.ForeignKey(
        Majors, related_name='undergrad_major', on_delete=models.CASCADE)
    grad = models.ForeignKey(
        Universities, related_name='grad_university', on_delete=models.CASCADE)
    grad_major = models.ForeignKey(
        Majors, related_name='grad_major', on_delete=models.CASCADE)
How can I have Django generate a form for submitting Resumes where users can type in their university name and major? All four of which would be used to create new entries in their respective databases (2 in Universities, 2 in Majors) before saving the new resume similar to how the inline formset example works for a singular foreign key.
EDIT2 : For making a form. I guess I'd have done a personalized form with overriding of save() method, something like this (forms.py):
class YourForm(forms.Form):
    """Plain form that creates a ``Resumes`` row plus its universities and majors."""

    # Fixed: the inline "#Choose your validators here" comment used to swallow
    # the closing "])" of every field. Choose your validators and add them to
    # each ``validators`` list.
    fname = forms.CharField(label="name", max_length=70, validators=[])
    fundergrad = forms.CharField(label="fundergrad", max_length=100, validators=[])
    fundergrad_major = forms.CharField(label="fundergrad_major", max_length=80, validators=[])
    fgrad = forms.CharField(label="fgrad", max_length=100, validators=[])
    fgrad_major = forms.CharField(label="fgrad_major", max_length=80, validators=[])

    def save(self, datas=None):
        """Create and save the related rows, then the resume itself.

        Args:
            datas: Mapping with the keys ``fname``, ``fundergrad``,
                ``fundergrad_major``, ``fgrad`` and ``fgrad_major``.
                Defaults to ``self.cleaned_data``.

        Returns:
            The saved ``Resumes`` instance.
        """
        if datas is None:
            datas = self.cleaned_data

        undergrad = Universities()
        undergrad_major = Majors()
        grad = Universities()
        grad_major = Majors()
        undergrad.name = datas['fundergrad']
        # Fixed: this assignment targeted the misspelled ``undegrad_major``.
        undergrad_major.name = datas['fundergrad_major']
        grad.name = datas['fgrad']
        grad_major.name = datas['fgrad_major']

        # The related rows are saved first, before the resume is given
        # references to them.
        undergrad.save()
        undergrad_major.save()
        grad.save()
        grad_major.save()

        res = Resumes()
        res.name = datas['fname']
        res.undergrad = undergrad
        res.undergrad_major = undergrad_major
        res.grad = grad
        res.grad_major = grad_major
        res.save()
        return res
In views.py :
def formView(request):
    """Handle a POSTed ``YourForm`` and save it when it validates."""
    if request.method == 'POST':
        form = YourForm(request.POST)
        if form.is_valid():
            # Fixed: the hand-built dict left out 'fname', which save() reads,
            # so saving raised KeyError. cleaned_data carries all five fields.
            form.save(form.cleaned_data)
            # Then do what you have to do in your view
EDIT1: (this doesn't answer the question, but it might help someone, so I am leaving it here)
I would have tried with something like this in admin.py:
# NOTE(review): ``fk_name`` names a ForeignKey on the inline's own model, but
# the ``Universities`` and ``Majors`` models shown with this question declare
# only ``name``. These inlines presumably fail Django's admin checks; confirm
# before relying on them.
class UniversitiesInline1(admin.StackedInline):
    model = Universities
    fk_name = "undergrad"


class UniversitiesInline2(admin.StackedInline):
    model = Universities
    fk_name = "grad"


class MajorsInline1(admin.StackedInline):
    model = Majors
    fk_name = "undergrad_major"


class MajorsInline2(admin.StackedInline):
    model = Majors
    fk_name = "grad_major"


# Fixed: the class header was missing its trailing colon.
class ResumesAdmin(admin.ModelAdmin):
    """Admin for ``Resumes`` with one stacked inline per foreign key."""

    inlines = [
        UniversitiesInline1,
        UniversitiesInline2,
        MajorsInline1,
        MajorsInline2,
    ]


admin.site.register(Resumes, ResumesAdmin)
Explanations : https://docs.djangoproject.com/en/dev/ref/contrib/admin/#working-with-a-model-with-two-or-more-foreign-keys-to-the-same-parent-model
I am using Scrapy for a project, in this project I am extracting the information from the xml.
In the xml document the format where I would like to implement the for loop:
<relatedPersonsList>
<relatedPersonInfo>...</relatedPersonInfo>
<relatedPersonInfo>
<relatedPersonName>
<firstName>Mark</firstName>
<middleName>E.</middleName>
<lastName>Lucas</lastName>
</relatedPersonName>
<relatedPersonAddress>
<street1>1 IMATION WAY</street1>
<city>OAKDALE</city>
<stateOrCountry>MN</stateOrCountry>
<stateOrCountryDescription>MINNESOTA</stateOrCountryDescription>
<zipCode>55128</zipCode>
</relatedPersonAddress>
<relatedPersonRelationshipList>
<relationship>Executive Officer</relationship>
<relationship>Director</relationship>
</relatedPersonRelationshipList>
<relationshipClarification/>
</relatedPersonInfo>
<relatedPersonInfo>...</relatedPersonInfo>
<relatedPersonInfo>...</relatedPersonInfo>
<relatedPersonInfo>...</relatedPersonInfo>
<relatedPersonInfo>...</relatedPersonInfo>
<relatedPersonInfo>...</relatedPersonInfo>
</relatedPersonsList>
As you can see in the <relatedPersonsList>, you can have multiple <relatedPersonInfo>, and when I try to make a for loop, I still only get the information of the first person.
This is my actual code:
# Fill one Myform per <relatedPersonInfo> node, using "NA" when the person
# has no middle name.
for person in xxs.select('./relatedPersonsList/relatedPersonInfo'):
    item = Myform() #even if get rid of it I get the same result
    item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
    middle_nodes = person.select('./relatedPersonName/middleName/text()')
    item["middleName"] = middle_nodes.extract()[0] if middle_nodes else "NA"
here is the code that I used on my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request
import urlparse
from formds.items import SecformD
class SecDform(CrawlSpider):
    """Crawl spider that follows listing pages and scrapes Form D XML documents."""

    name = "DFORM"
    # NOTE(review): Scrapy reads ``allowed_domains`` (plural, bare domain
    # names), so this attribute is presumably ignored -- confirm before
    # renaming, since renaming would switch on off-site filtering.
    allowed_domain = ["http://www..gov"]
    start_urls = [
        ""
    ]
    rules = (
        Rule(
            SgmlLinkExtractor(restrict_xpaths=["/html/body/div/table/tr/td[3]/a[2]"]),
            callback='parse_formd',
            #follow= True no need of follow thing
        ),
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('/html/body/div/center[1]/a[contains(., "[NEXT]")]')),
            follow=True
        ),
    )

    def parse_formd(self, response):
        """Yield one request per XML document link found on a filing page."""
        hxs = HtmlXPathSelector(response)
        # Fixed: the attribute tests were written with '#' ("#id", "#href"),
        # which is not valid XPath; attributes are addressed with '@'.
        sites = hxs.select('//*[@id="formDiv"]/div/table/tr[3]/td[3]/a/@href').extract()
        for site in sites:
            yield Request(url=urlparse.urljoin(response.url, site), callback=self.parse_xml_document)

    def parse_xml_document(self, response):
        """Yield one ``SecformD`` item per related person in the document.

        Every item carries the issuer-level fields. When the document lists
        no related person, a single item holding only the issuer-level
        fields is yielded.
        """
        xxs = XmlXPathSelector(response)
        issuer_fields = {
            "stateOrCountryDescription": xxs.select('./primaryIssuer/issuerAddress/stateOrCountryDescription/text()').extract()[0],
            "zipCode": xxs.select('./primaryIssuer/issuerAddress/zipCode/text()').extract()[0],
            "issuerPhoneNumber": xxs.select('./primaryIssuer/issuerPhoneNumber/text()').extract()[0],
        }
        persons = xxs.select('./relatedPersonsList//relatedPersonInfo')
        if not persons:
            yield SecformD(issuer_fields)
            return
        for person in persons:
            # Fixed: one shared item used to be overwritten on every
            # iteration and returned after the loop, so only the last person
            # was exported. A fresh item is now built and yielded per person.
            item = SecformD(issuer_fields)
            item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]
            middle_name = person.select('./relatedPersonName/middleName/text()')
            item["middleName"] = middle_name.extract()[0] if middle_name else "NA"
            yield item
I extract the information to a .json file using this command:
scrapy crawl DFORM -o tes4.json -t json
Try something like this:
def parse_xml_document(self, response):
    """Build and return a list with one ``SecformD`` item per related person.

    The three issuer-level values are extracted once and copied into every
    item.
    """
    xxs = XmlXPathSelector(response)

    # Values shared by all persons of this document.
    state_description = xxs.select('./primaryIssuer/issuerAddress/stateOrCountryDescription/text()').extract()[0]
    zip_code = xxs.select('./primaryIssuer/issuerAddress/zipCode/text()').extract()[0]
    phone_number = xxs.select('./primaryIssuer/issuerPhoneNumber/text()').extract()[0]

    collected = []
    for person in xxs.select('./relatedPersonsList//relatedPersonInfo'):
        # A new item per person, so earlier persons are not overwritten.
        item = SecformD()
        item["stateOrCountryDescription"] = state_description
        item["zipCode"] = zip_code
        item["issuerPhoneNumber"] = phone_number
        item["firstName"] = person.select('./relatedPersonName/firstName/text()').extract()[0]

        middle_nodes = person.select('./relatedPersonName/middleName/text()')
        item["middleName"] = middle_nodes.extract()[0] if middle_nodes else "NA"

        collected.append(item)
    return collected
think about these:
here is a function,
def calculate(model):
    """Attach a temporary attribute ``tempfield`` with value 1 to ``model``."""
    setattr(model, "tempfield", 1)
This function stores a temporary field on the model instance,
and you can then use model.tempfield everywhere.
But with a queryset, these temporary fields are lost after an order_by.
How can I order a queryset by such a temporary field?
i have 2 model:
class A(models.Model):
    """Model with a single ``name`` column."""

    # Fixed: the statement began with ``Class`` (capital C), and the keyword
    # argument was spelled ``maxlength`` rather than ``max_length``.
    name = models.CharField(max_length=100)
class Log_Of_A(models.Model):
    """Click count recorded for one ``A`` row on one date."""

    # Fixed: the statement began with ``Class`` (capital C).
    clicks = models.IntegerField()
    # on_delete is spelled out: it is a required argument from Django 2.0
    # onwards, and CASCADE matches the implicit default of older releases.
    a = models.ForeignKey(A, on_delete=models.CASCADE)
    date = models.DateField(db_index=True)
and calculate the Log of A by date
def createlog(request):
    """Return every ``A`` with a ``clicks`` attribute, ordered by ascending clicks.

    ``clicks`` is the sum of ``Log_Of_A.clicks`` for that object, strictly
    between the ``start`` and ``end`` GET parameters. It is 0 when there are
    no matching logs.

    Returns:
        A list of ``A`` instances (not a queryset).
    """
    start = request.GET.get("start")
    end = request.GET.get("end")
    all_A = list(A.objects.all())
    # One aggregate query per object.
    for a in all_A:
        logs = Log_Of_A.objects.filter(a=a, date__gt=start, date__lt=end)
        statistics = logs.aggregate(Sum("clicks"))
        # Sum() is None when nothing matches; 0 keeps the values sortable.
        a.clicks = statistics["clicks__sum"] or 0
    # Fixed: ``all_A.order_by("clicks")`` discarded its result, and ``clicks``
    # is not a database field anyway. ``clicks`` lives only on the instances
    # in memory, so the ordering has to happen in Python.
    return sorted(all_A, key=lambda a: a.clicks)
how to order_by temporary field
Try this:
import operator
def createlog(request):
    """Return every ``A`` with a ``clicks`` attribute, highest click count first.

    ``clicks`` is the sum of ``Log_Of_A.clicks`` for that object, strictly
    between the ``start`` and ``end`` GET parameters. It is 0 when there are
    no matching logs.

    Returns:
        A list of ``A`` instances (not a queryset).
    """
    start = request.GET.get("start")
    end = request.GET.get("end")
    all_A = A.objects.all()
    for a in all_A:
        logs = Log_Of_A.objects.filter(a=a, date__gt=start, date__lt=end)
        statistics = logs.aggregate(Sum("clicks"))
        # Sum() is None when nothing matches; 0 keeps the values sortable.
        a.clicks = statistics["clicks__sum"] or 0
    # Fixed: assigning the result to a local named ``sorted`` made the name
    # local to the function, so calling the builtin raised UnboundLocalError.
    ordered = sorted(all_A, key=operator.attrgetter('clicks'), reverse=True)
    return ordered