I want to upload an XML file into a PostgreSQL database using Django

I am new to Django and my current task is to upload an XML file with 16 fields and more than 60,000 rows into a PostgreSQL database. I used Django to connect to the database and was able to create a table in it.
I also used xml.etree.ElementTree to parse the XML file, but I am having trouble storing the data in the table that I created in the PostgreSQL database.
This is the code that I used to parse:
import xml.etree.ElementTree as ET

def saveXML2db():
    my_file = "C:/Users/Adithyas/myproject/scripts/supplier_lookup.xml"
    tree = ET.parse(my_file)
    root = tree.getroot()
    cols = ["organization", "code", "name"]
    rows = []
    for i in root:
        organization = i.find("organization").text
        code = i.find("code").text
        name = i.find("name").text
        x = rows.append([organization, code, name])
    data = """INSERT INTO records(organization,code,name) VALUES(%s,%s,%s)"""
    x.save()

saveXML2db()
The code runs without any errors, but I am unable to store the data in the table in the PostgreSQL database.

So I figured out the answer to my question and I wish to share it here.
This is how I imported an XML file into a PostgreSQL database using the Django ORM:
First, I created a virtual environment to work with:
Open a command prompt in the folder where you want to create the project:
py -m venv envy
envy\Scripts\activate
Our virtual environment is ready to use.
Then install the dependencies and create the project and the app:
pip install django
pip install psycopg2
django-admin startproject projectq
cd projectq
py manage.py startapp myapp
Now both our project and our app are created and ready to use.
code .  # open Visual Studio Code
Now go to settings.py in 'projectq' and add 'myapp' to INSTALLED_APPS:
INSTALLED_APPS = [
    # ... the default apps ...
    'myapp',  # add myapp to the installed apps
]
Now, to connect our project to the PostgreSQL database, we have to make some changes to the DATABASES setting in settings.py as well:
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql_psycopg2',
        'NAME': 'projectq',
        'USER': 'postgres',
        'PASSWORD': '1234',
    }
}
Replace the default sqlite3 settings with the PostgreSQL engine shown above, and fill in the name of your database, your username, and your password.
Now the connection is established and we can move on to the next step.
Go to models.py to define the model whose table will store our XML data:
from django.db import models

# Create your models here.
class Record(models.Model):
    po_organization = models.IntegerField()
    code = models.CharField(max_length=100)
    name = models.CharField(max_length=100)
    address_1 = models.CharField(max_length=100, null=True)
    address_2 = models.CharField(max_length=100, null=True)
If your data has null values, it's best to add null=True to those fields to avoid errors. Then run the migrations:
py manage.py makemigrations
py manage.py migrate
The table we created should now appear in the PostgreSQL database.
The next step is to parse our XML file and import it into the table we created. For that we will use Django ORM queries.
Open a terminal in Visual Studio Code, activate the virtual environment again, and start the Django shell:
py manage.py shell
Now enter this code in the interactive console:
>>> from myapp.models import Record
>>> import xml.etree.ElementTree as ET
>>> def data2db():
...     file_dir = 'supplier_lookup.xml'
...     data = ET.parse(file_dir)
...     root = data.findall('record')
...     for i in root:
...         organization = i.find('organization').text
...         code = i.find('code').text
...         name = i.find('name').text
...         address_1 = i.find('address_1').text
...         address_2 = i.find('address_2').text
...         # objects.create() saves the row immediately, so no extra save() call is needed
...         Record.objects.create(po_organization=organization, code=code,
...                               name=name, address_1=address_1, address_2=address_2)
...
>>> data2db()
That's it. The data should be loaded into the database now.
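One note: with more than 60,000 rows, calling create() once per row issues one INSERT per row and can be slow. A minimal sketch of the same import using bulk_create (same Record model and XML layout as above; the batch_size of 1000 is just an illustrative choice):

from myapp.models import Record
import xml.etree.ElementTree as ET

def data2db_bulk(file_dir='supplier_lookup.xml'):
    data = ET.parse(file_dir)
    records = []
    for i in data.findall('record'):
        records.append(Record(
            po_organization=i.find('organization').text,
            code=i.find('code').text,
            name=i.find('name').text,
            address_1=i.find('address_1').text,
            address_2=i.find('address_2').text,
        ))
    # one INSERT per batch instead of one per row
    Record.objects.bulk_create(records, batch_size=1000)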
Hope this helps.

Have you checked any Python/PostgreSQL examples? Your code should have something like this (untested):
import psycopg2

def storeXmlToPostgres(xmldata):
    with psycopg2.connect(host="dbhost", database="dbname", user="username", password="password") as conn:
        sql = "INSERT INTO records(organization,code,name) VALUES(%s,%s,%s)"
        cur = conn.cursor()
        for i in xmldata:
            organization = i.find("organization").text
            code = i.find("code").text
            name = i.find("name").text
            cur.execute(sql, [organization, code, name])
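A quick usage sketch (assuming, as in the question, that the records are the direct children of the XML root; psycopg2 commits the transaction when the with-block exits without an error):

import xml.etree.ElementTree as ET

tree = ET.parse("supplier_lookup.xml")
# pass the root element; storeXmlToPostgres iterates over its child records
storeXmlToPostgres(tree.getroot())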

Related

Problems using Django 1.11 with a SQL Server database view

I am trying to use Django (1.11.4) to read data from a SQL Server view (SQL Server 2012; I use sql_server.pyodbc, aka django-pyodbc, for this), and nothing seems to work.
Here's my model:
class NumUsersAddedPerWeek(models.Model):
    id = models.BigIntegerField(primary_key=True)
    year = models.IntegerField('Year')
    week = models.IntegerField('Week')
    num_added = models.IntegerField('Number of Users Added')

    if not settings.RUNNING_UNITTESTS:
        class Meta:
            managed = False
            db_table = 'num_users_added_per_week'
and here's how the database view is created:
create view num_users_added_per_week
as
select row_number() over(order by datepart(year, created_at), datepart(week, created_at)) as 'id',
datepart(year, created_at) as 'year', datepart(week, created_at) as 'week', count(*) as 'num_added'
from [<database name>].[dbo].[<table name>]
where status = 'active' and created_at is not null
group by datepart(year, created_at), datepart(week, created_at)
The view works just fine by itself (e.g., running 'select * from num_users_added_per_week' works, and very quickly).
I used the following Django management command (i.e., 'action') to try three different ways of pulling data via the model, and none of them worked (although, judging from other posts, these approaches seemed to work with previous versions of Django):
from django.core.management.base import BaseCommand, CommandError
from <project name>.models import NumUsersAddedPerWeek
from django.db import connection

class Command(BaseCommand):
    def handle(self, *args, **options):
        # attempt # 1 ...
        num_users_info = NumUsersAddedPerWeek.objects.all()
        info = num_users_info.first()
        for info in num_users_info:
            print(info)

        # attempt # 2 ...
        cursor = connection.cursor()
        cursor.execute('select * from num_users_added_per_week')
        result = cursor.fetchall()

        # attempt # 3 ...
        num_users_info = NumUsersAddedPerWeek.objects.raw('select * from num_users_added_per_week')
        for info in num_users_info:
            print(info)
Each of the 3 different approaches gives me the same error: "('42S02', "[42S02] [Microsoft][ODBC SQL Server Driver][SQL Server]Invalid object name 'num_users_added_per_week'. (208) (SQLExecDirectW)")"
Please note: my migrations are running just fine - adding class Meta: managed = False is crucial with latest versions of Django in situations where you do not want migrations to create / update / delete your sql table structure...
I figured it out: I have a custom database router (in settings.DATABASE_ROUTERS) that I had not properly added this model to (I am doing this because the project has multiple databases; see Multi-DB to see why and how to do this). So this was a boneheaded bug on my part.
But here's what I found out: it turns out all three of the methods I used should work if you have one database in your project. If you have multiple databases, then you can query the database through your model object (e.g., <Model Name>.objects.all()) or through raw SQL, but you have to issue the raw SQL via your model (e.g., <Model Name>.objects.raw('select * from <view name>')), otherwise your database router will not know which database to use.
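For reference, a minimal sketch of the kind of router entry that was missing (the 'reporting' alias and the ReportingRouter name are made up for illustration; the real class lives wherever settings.DATABASE_ROUTERS points):

class ReportingRouter:
    """Send the reporting models (including NumUsersAddedPerWeek) to the 'reporting' database."""

    reporting_models = {'numusersaddedperweek'}

    def db_for_read(self, model, **hints):
        if model._meta.model_name in self.reporting_models:
            return 'reporting'
        return None  # fall back to the default database

    def db_for_write(self, model, **hints):
        return self.db_for_read(model, **hints)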

Why is Flask-Migrate making me do a 2-step migration?

I'm working on a project with Flask, SQLAlchemy, Alembic and their wrappers for Flask (Flask-SQLAlchemy and Flask-Migrate). I have four migrations:
1c5f54d4aa34 -> 4250dfa822a4 (head), Feed: Countries
312c1d408043 -> 1c5f54d4aa34, Feed: Continents
41984a51dbb2 -> 312c1d408043, Basic Structure
<base> -> 41984a51dbb2, Init Alembic
When I start a new and clean database and try to run the migrations I get an error:
vagrant@precise32:/vagrant$ python manage.py db upgrade
...
sqlalchemy.exc.ProgrammingError: (ProgrammingError) relation "continent" does not exist
...
If I ask Flask-Migrate to run all migrations but the last, it works. If after that I run the upgrade command again, it works – that is, it fully upgrades my database without a single change in code:
vagrant@precise32:/vagrant$ python manage.py db upgrade 312c1d408043
INFO [alembic.migration] Context impl PostgresqlImpl.
INFO [alembic.migration] Will assume transactional DDL.
INFO [alembic.migration] Running upgrade -> 41984a51dbb2, Init Alembic
INFO [alembic.migration] Running upgrade 41984a51dbb2 -> 312c1d408043, Basic Structure
vagrant@precise32:/vagrant$ python manage.py db upgrade
INFO [alembic.migration] Context impl PostgresqlImpl.
INFO [alembic.migration] Will assume transactional DDL.
INFO [alembic.migration] Running upgrade 312c1d408043 -> 1c5f54d4aa34, Feed: Continents
INFO [alembic.migration] Running upgrade 1c5f54d4aa34 -> 4250dfa822a4, Feed: Countries
TL;DR
The last migration (Feed: Countries) runs queries on the table fed by the previous one (Feed: Continents). If the continent table has been created and fed, the scripts should work. But they don't.
Why do I have to stop the migration process between them and re-start it with another command? I really don't get this. Is it some command Alembic executes after a series of migrations? Any ideas?
Just in case
My models are defined as follows:
class Country(db.Model):
    __tablename__ = 'country'
    id = db.Column(db.Integer, primary_key=True)
    alpha2 = db.Column(db.String(2), index=True, unique=True)
    title = db.Column(db.String(140))
    continent_id = db.Column(db.Integer, db.ForeignKey('continent.id'))
    continent = db.relationship('Continent', backref='countries')

    def __repr__(self):
        return '<Country #{}: {}>'.format(self.id, self.title)


class Continent(db.Model):
    __tablename__ = 'continent'
    id = db.Column(db.Integer, primary_key=True)
    alpha2 = db.Column(db.String(2), index=True, unique=True)
    title = db.Column(db.String(140))

    def __repr__(self):
        return '<Continent #{}: {}>'.format(self.id, self.title)
Many thanks,
UPDATE 1: The upgrade methods of the last two migrations
As @Miguel asked in a comment, here are the upgrade methods of the last two migrations:
Feed: Continents
def upgrade():
    csv_path = app.config['BASEDIR'].child('migrations', 'csv', 'en')
    csv_file = csv_path.child('continents.csv')
    with open(csv_file) as file_handler:
        csv = list(reader(file_handler))
    csv.pop(0)
    data = [{'alpha2': c[0].lower(), 'title': c[1]} for c in csv]
    op.bulk_insert(Continent.__table__, data)
Feed: Countries (which depends on the table fed by the previous migration)
def upgrade():
    # load countries iso3166.csv and build a dictionary
    csv_path = app.config['BASEDIR'].child('migrations', 'csv', 'en')
    csv_file = csv_path.child('iso3166.csv')
    countries = dict()
    with open(csv_file) as file_handler:
        csv = list(reader(file_handler))
        for c in csv:
            countries[c[0]] = c[1]

    # load countries-continents from country_continent.csv
    csv_file = csv_path.child('country_continent.csv')
    with open(csv_file) as file_handler:
        csv = list(reader(file_handler))
        country_continent = [{'country': c[0], 'continent': c[1]} for c in csv]

    # loop
    data = list()
    for item in country_continent:
        # get continent id
        continent_guess = item['continent'].lower()
        continent = Continent.query.filter_by(alpha2=continent_guess).first()
        # include country
        if continent is not None:
            country_name = countries.get(item['country'], False)
            if country_name:
                data.append({'alpha2': item['country'].lower(),
                             'title': country_name,
                             'continent_id': continent.id})
The CSV I'm using are basically following this patterns:
continents.csv
...
AS, "Asia"
EU, "Europe"
NA, "North America"
...
iso3166.csv
...
CL,"Chile"
CM,"Cameroon"
CN,"China"
...
country_continent.csv
...
US,NA
UY,SA
UZ,AS
...
So Feed: Continents feeds the continent table, and Feed: Countries feeds the country table. But it has to query the continents table in order to make the proper link between the country and the continent.
UPDATE 2: Someone on Reddit already offered an explanation and a workaround
I asked the same question on Reddit, and themathemagician said:
I've run into this before, and the issue is that the migrations don't
execute individually, but instead alembic batches all of them (or all
of them that need to be run) and then executes the SQL. This means
that by the time the last migration is trying to run, the tables don't
actually exist yet so you can't actually make queries. Doing
from alembic import op

def upgrade():
    # migration stuff
    op.execute('COMMIT')
    # run queries
This isn't the most elegant solution (and that was for Postgres, the
command may be different for other dbs), but it worked for me. Also,
this isn't actually an issue with Flask-Migrate as much as an issue
with alembic, so if you want to Google for more info, search for
alembic. Flask-Migrate is just a wrapper around alembic that works
with Flask-Script easily.
As indicated by @themathemagician on Reddit, Alembic by default runs all the migrations in a single transaction, so depending on the database engine and what you do in your migration scripts, some operations that depend on things added in a previous migration may fail.
I haven't tried this myself, but Alembic 0.6.5 introduced a transaction_per_migration option, which might address this. This is an option to the configure() call in env.py. If you are using the default config files as Flask-Migrate creates them, then this is where you fix this in migrations/env.py:
def run_migrations_online():
    """Run migrations in 'online' mode."""
    # ...
    context.configure(
        connection=connection,
        target_metadata=target_metadata,
        transaction_per_migration=True  # <-- add this
    )
    # ...
Also note that if you plan to also run offline migrations you need to fix the configure() call in the run_migrations_offline() in the same way.
Give this a try and let me know if it addresses the problem.

Neo4j and Django

Is it possible to use neomodel to make models in Django?
How do I integrate Neo4j with Django? I'm using Python 3, so neo4django isn't really an option.
I'm new to both of them and at the moment I'm a little confused...
Thanks a lot! :3
Hey, neomodel supports Python 3 out of the box. You can use it with or without Django; check out the documentation here: http://neomodel.readthedocs.org/en/latest/
NEOMODEL_NEO4J_BOLT_URL = 'bolt://neo4j:password@localhost:7687'
NEOMODEL_SIGNALS = True
NEOMODEL_FORCE_TIMEZONE = False
NEOMODEL_ENCRYPTED_CONNECTION = True
NEOMODEL_MAX_POOL_SIZE = 50
You need to add the above lines to your settings.py file to use Neo4j.
Hey, I know this was asked 8 years ago, but now there's a tool called neomodel which serves this exact purpose. It can also be combined with django-neomodel, which allows neomodel to be easily integrated into your Django project.
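Note: for the manage.py commands mentioned below to be available, django-neomodel has to be installed and, as far as I understand its setup, registered as an app (double-check against the django-neomodel README):

pip install neomodel django-neomodel

and in settings.py:

INSTALLED_APPS = [
    # ...
    'django_neomodel',  # provides the install_labels / clear_neo4j management commands
]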
Using django-neomodel, all you need to do is specify the URL of the database in your settings.py file like so:
NEOMODEL_NEO4J_BOLT_URL = 'bolt://{username}:{password}@{HOSTorIP}'
You can easily create models in your models.py file. Here are some examples from the neomodel documentation:
from neomodel import (StructuredNode, StringProperty,
                      UniqueIdProperty, IntegerProperty,
                      RelationshipTo)

class Country(StructuredNode):
    code = StringProperty(unique_index=True, required=True)

class Person(StructuredNode):
    uid = UniqueIdProperty()
    name = StringProperty(unique_index=True)
    age = IntegerProperty(index=True, default=0)

    # traverse outgoing IS_FROM relations, inflate to Country objects
    country = RelationshipTo(Country, 'IS_FROM')
Then you can run python manage.py install_labels to do the equivalent of running migrations, or python manage.py clear_neo4j to clear all nodes from the database.
Nodes can be created like so:
from models import Person, Country
john = Person(name="john", age=23).save()
frank = Person(name="frank", age=50).save()
canada = Country(code=5).save()
And relationships like so:
john.country.connect(canada)
And nodes/relationships can be retrieved as follows:
frank = Person.nodes.get(name='frank')
frank.age += 1
frank.save()
franks_country = frank.country.single()
print(franks_country)
# {'code': 5}

Haystack-Whoosh not indexing any documents

I followed the Haystack tutorial to set up for Whoosh
pip install whoosh
settings.py
import os

HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'haystack.backends.whoosh_backend.WhooshEngine',
        'PATH': os.path.join(os.path.dirname(__file__), 'whoosh_index'),
    },
}
and I am getting an empty list
>>> list(ix.searcher().documents())
[]
Following is my code for search_indexes.py:
from haystack import indexes
from view_links.models import Projdb

class ProjdbIndex(indexes.SearchIndex, indexes.Indexable):
    text = indexes.CharField(document=True, use_template=True)
    title = indexes.CharField(model_attr='title')
    author = indexes.CharField(model_attr='owner')
    # pub_date = indexes.DateTimeField(model_attr='date_start')

    def get_model(self):
        return Projdb

    def index_queryset(self, using=None):
        """Used when the entire index for model is updated."""
        return self.get_model().objects.all()  # filter(pub_date__lte=datetime.datetime.now())
I was previously able to get results with Elasticsearch, but after I switched to Whoosh I get no results.
Thank you for your time. If you require further information, please let me know.
EDIT:
I am getting results now, and here are two things I learned:
1. I need to register the app whose model is being used for indexing (add it to INSTALLED_APPS).
2. If the model's class name is misspelled in search_indexes.py, running python manage.py rebuild_index does not throw any error, and you will end up with zero indexed objects.
Did you run the command?
./manage.py rebuild_index
Do you have any Projdb records?
You have this in your code:
text = indexes.CharField(document=True, use_template=True)
Have you set up the corresponding template (projdb_text.txt)?
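For reference, with use_template=True Haystack renders a data template for the text field; by default it looks for it at a path like templates/search/indexes/view_links/projdb_text.txt (the app label here is taken from the import in the question), containing something along the lines of:

{{ object.title }}
{{ object.owner }}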

Django + PostgreSQL: How to reset primary key?

I have been working on an application in Django. To begin with, for simplicity, I had been using sqlite3 for the database.
However, once I moved to PostgreSQL, I've run into a bit of a problem: the primary key does not reset once I clear out a table.
This app is a game that is played over a long time period (weeks). As such, every time a new game starts, all of the data is cleared out of the database and then new, randomized data is added.
I'd like to be able to "start over" with primary keys starting at 1 each time I clean/rebuild the game.
The code still works as-is, but integers are a pretty natural way for describing the objects in my game. I'd like to have each new game start at 1 rather than wherever the last game left off.
How can I reset the primary key counter in PostgreSQL? Keep in mind that I don't need to preserve the data in the table since I am wiping it out anyway.
In your app directory try this:
python manage.py help sqlsequencereset
Pipe it into psql like this to actually run the reset:
python manage.py sqlsequencereset myapp1 myapp2 | psql
Edit: here's an example of the output from this command on one of my tables:
BEGIN;
SELECT setval('"project_row_id_seq"', coalesce(max("id"), 1), max("id") IS NOT null) FROM "project_row";
COMMIT;
As suggested by "Van Gale", you can get the commands to solve your problem by running sqlsequencereset.
Alternatively, you can execute the SQL queries generated by sqlsequencereset from within Python in this way (using the default database):
from django.core.management.color import no_style
from django.db import connection

from myapps.models import MyModel1, MyModel2

sequence_sql = connection.ops.sequence_reset_sql(no_style(), [MyModel1, MyModel2])
with connection.cursor() as cursor:
    for sql in sequence_sql:
        cursor.execute(sql)
I tested this code with Python 3.6, Django 2.0 and PostgreSQL 10.
If you want to do it with raw SQL, you can run this (the sequence is named after the table, i.e. <app>_<model>_id_seq):
ALTER SEQUENCE yourapp_yourmodel_id_seq RESTART WITH 1;
docs:
http://www.postgresql.org/docs/8.2/static/sql-altersequence.html
I view auto-increment primary keys as purely internal identifiers for database records, and I don't like exposing them to users. Granted, it's a common design to use them as part of URLs, but even there slugs or other identifiers feel more appropriate.
If you do not want to have to manually list the apps you need, or if you have a series of different databases, this management command will dynamically gather all connections from settings.py and reset their sequences.
To run it, use: python manage.py reset_sequences
import psycopg2
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import connections


def dictfetchall(cursor):
    """Return all rows from a cursor as a dict"""
    columns = [col[0] for col in cursor.description]
    return [
        dict(zip(columns, row))
        for row in cursor.fetchall()
    ]


class Command(BaseCommand):
    help = "Resets sequencing errors in Postgres which normally occur due to importing/restoring a DB"

    def handle(self, *args, **options):
        # loop over all databases in system to figure out the tables that need to be reset
        for name_to_use_for_connection, connection_settings in settings.DATABASES.items():
            db_name = connection_settings['NAME']
            host = connection_settings['HOST']
            user = connection_settings['USER']
            port = connection_settings['PORT']
            password = connection_settings['PASSWORD']

            # connect to this specific DB
            conn_str = f"host={host} port={port} user={user} password={password}"
            conn = psycopg2.connect(conn_str)
            conn.autocommit = True

            select_all_table_statement = f"""SELECT *
                FROM information_schema.tables
                WHERE table_schema = 'public'
                ORDER BY table_name;
            """

            # just a visual representation of where we are
            print('-' * 20, db_name)

            try:
                not_reset_tables = list()
                # use the specific name for the DB
                with connections[name_to_use_for_connection].cursor() as cursor:
                    # using the current db as the cursor connection
                    cursor.execute(select_all_table_statement)
                    rows = dictfetchall(cursor)

                    # will loop over table names in the connected DB
                    for row in rows:
                        find_pk_statement = f"""
                            SELECT k.COLUMN_NAME
                            FROM information_schema.table_constraints t
                            LEFT JOIN information_schema.key_column_usage k
                            USING(constraint_name,table_schema,table_name)
                            WHERE t.constraint_type='PRIMARY KEY'
                                AND t.table_name='{row['table_name']}';
                        """
                        cursor.execute(find_pk_statement)
                        pk_column_names = dictfetchall(cursor)

                        for pk_dict in pk_column_names:
                            column_name = pk_dict['column_name']
                            # time to build the reset sequence command for each table
                            # taken from django: https://docs.djangoproject.com/en/3.0/ref/django-admin/#sqlsequencereset
                            # example: SELECT setval(pg_get_serial_sequence('"[TABLE]"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "[TABLE]";
                            try:
                                reset_statement = f"""SELECT setval(pg_get_serial_sequence('"{row['table_name']}"','{column_name}'),
                                    coalesce(max("{column_name}"), 1), max("{column_name}") IS NOT null) FROM "{row['table_name']}" """
                                cursor.execute(reset_statement)
                                return_values = dictfetchall(cursor)

                                # will be 1 row
                                for value in return_values:
                                    print(f"Sequence reset to {value['setval']} for {row['table_name']}")
                            except Exception as ex:
                                # will only fail if PK is not an integer...
                                # currently in my system this is from django.contrib.sessions
                                not_reset_tables.append(f"{row['table_name']} not reset")

            except psycopg2.Error as ex:
                raise SystemExit(f'Error: {ex}')

            conn.close()

            print('-' * 5, ' ALL ERRORS ', '-' * 5)
            for item_statement in not_reset_tables:
                # shows which tables produced errors, so far I have only
                # seen this with PK's that are not integers because of the MAX() method
                print(item_statement)

            # just a visual representation of where we are
            print('-' * 20, db_name)
You need to truncate the table.
See http://www.postgresql.org/docs/8.1/static/sql-truncate.html
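In current PostgreSQL versions, TRUNCATE can also reset the table's sequence in the same statement. A short sketch from the Django shell (the table name myapp_record is only an example; Django names tables <app>_<model> by default):

from django.db import connection

with connection.cursor() as cursor:
    # empties the table and resets its id sequence back to 1
    cursor.execute('TRUNCATE TABLE "myapp_record" RESTART IDENTITY CASCADE;')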