boto2 dynamodb put_item always creates a new item - python-2.7

I am using boto 2.45 and Python 2.7. Here is the code:
import time
import boto
from boto.dynamodb2.layer1 import DynamoDBConnection
from boto.dynamodb2.table import Table
from boto.dynamodb2.fields import HashKey, RangeKey, KeysOnlyIndex, GlobalAllIndex

access_key = 'XXX'
secret_key = 'YYY'
conn = boto.dynamodb2.connect_to_region(
    'us-east-2',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)
table = Table('Table1', connection=conn)
table.put_item(data={
    'name': 'aaa.rar',
    'time': int(time.time()),
    'size': 200000000000,
    'title': 'from boto'
})
Every time I run this code, it creates a new item in the table with the name 'aaa.rar'. The table's primary key is name. This is not what I want: I want an exception to be raised if an item with the same name already exists in the table.
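One way to get that behaviour (not part of the original thread) is a conditional write: boto2's Table.put_item takes an overwrite flag, and with overwrite=False the write should fail with a ConditionalCheckFailedException when an item with the same key already exists. Note that if the table actually has a composite key (say, name plus time as a range key), every put with a new time value is a new item regardless, which would also explain the duplicates. A minimal sketch, assuming the hash key really is just name:

from boto.dynamodb2.exceptions import ConditionalCheckFailedException

try:
    # Conditional put: refuse to replace an existing item with this key.
    table.put_item(data={
        'name': 'aaa.rar',
        'time': int(time.time()),
        'size': 200000000000,
        'title': 'from boto'
    }, overwrite=False)
except ConditionalCheckFailedException:
    raise Exception("An item named 'aaa.rar' already exists")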

Related

How can I save the output of an API call to DynamoDB?

I am calling a REST API from a Lambda function in AWS. I receive the response in the terminal. How can I store these API responses in DynamoDB? I am sorry if this seems naive to some; I am a beginner and self-learner.
Here is a full working example for a Python-based Lambda function:
# Import AWS (Python version) SDK
import boto3

def lambda_handler(event, context):
    # Construct the client to the DynamoDB service
    client = boto3.client('dynamodb')
    db = boto3.resource('dynamodb')
    table_name = 'my-awesome-table'
    # Table instance
    table = db.Table(table_name)
    # Example data to query an entry from the table
    primary_column_name = 'email'
    primary_key = 'john@gmail.com'
    # We get an object for the retrieved table entry
    table_entry = get_table_item(table, primary_column_name, primary_key)
    # Example data to put into the table
    new_entry = {
        'email': 'jack@outlook.com',
        'name': 'Jack Ma',
        'age': 56
    }
    put_table_item(table, new_entry)

# Function to retrieve an entry from the DynamoDB table
def get_table_item(table, primary_column_name, primary_key):
    response = table.get_item(
        Key={
            primary_column_name: primary_key
        }
    )
    return response['Item']

# Function to put an entry into the DynamoDB table
def put_table_item(table, item):
    response = table.put_item(
        Item=item
    )
    # If there were errors, throw an exception
    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
        raise Exception('Failed to update table!')
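The question itself was about saving an API response, which the example above does not show directly. A minimal sketch of that step, assuming the response body is JSON and that the payload contains the table's partition key (both assumptions, not from the thread); note that the resource API rejects Python floats, so numbers are parsed as Decimal:

import json
from decimal import Decimal

import boto3

def save_api_response(response_text, table_name):
    # Parse the JSON response; DynamoDB's resource API does not accept float,
    # so numeric values are converted to Decimal.
    item = json.loads(response_text, parse_float=Decimal)
    # Assumes the payload contains the table's partition key (e.g. an 'id' field).
    table = boto3.resource('dynamodb').Table(table_name)
    table.put_item(Item=item)

# Example: save_api_response('{"id": "abc123", "price": 9.99}', 'my-awesome-table')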

Describe Images (AMI) with tag values using boto3

I have a boto3 script that describes images.
However, I want to customize the script so that it shows only the value of the Name tag of each AMI.
Currently it's displaying:
([{u'Value': 'NAME OF AMI', u'Key': 'Name'}], 'ami-xxxxxxxxxxxxxx', 'available')
But I want something like this:
Name: 'NAME OF AMI', 'ami-xxxxxxxxxxxxxx', 'available'
Here is my script:
import boto3
import sys
import pprint

ec2 = boto3.resource('ec2')
client = boto3.client('ec2')
response = client.describe_images(Owners=['self'])
for ami in response['Images']:
    print(ami['Tags'], ami['ImageId'], ami['State'])
There can be multiple Tags on an AMI. This code will display the Value of the tag that has a Key of Name (and it will also show the Name of the AMI that normally appears as "AMI Name" in the console, which is a different name!):
import boto3
import sys
import pprint

ec2 = boto3.resource('ec2')
client = boto3.client('ec2')
response = client.describe_images(Owners=['self'])
for ami in response['Images']:
    if 'Tags' in ami:
        name = [tag['Value'] for tag in ami['Tags'] if tag['Key'] == 'Name'][0]
    else:
        name = ''
    print(name, ami['Name'], ami['ImageId'], ami['State'])
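As a side note (not from the answer), if you only care about images carrying a specific Name tag, EC2 can also filter server-side via the Filters parameter of describe_images; a small sketch with a placeholder tag value:

import boto3

client = boto3.client('ec2')
# Only return images whose 'Name' tag equals the placeholder value below.
response = client.describe_images(
    Owners=['self'],
    Filters=[{'Name': 'tag:Name', 'Values': ['my-ami-name']}],
)
for ami in response['Images']:
    print(ami['ImageId'], ami['State'])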

Required parameter is missing error while writing to bigQuery with google.cloud.bigquery in Python

I am loading a newline-delimited JSON file into BigQuery using the following code snippet in Python 2.7:
from google.cloud import bigquery
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials

bigquery_client = bigquery.Client()
dataset = bigquery_client.dataset('testGAData')
table_ref = dataset.table('gaData')
table = bigquery.Table(table_ref)

with open('gaData.json', 'rb') as source_file:
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = 'NEWLINE_DELIMITED_JSON'
    job = bigquery_client.load_table_from_file(
        source_file, table, job_config=job_config)
It returns the following error:
File "/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/google/cloud/bigquery/client.py", line 897, in load_table_from_file
raise exceptions.from_http_response(exc.response)
google.api_core.exceptions.BadRequest: 400 POST https://www.googleapis.com/upload/bigquery/v2/projects/test-project-for-experiments/jobs?uploadType=resumable: Required parameter is missing
Why am I getting this error? How can I fix this? Has anyone else faced a similar issue? Thanks in advance.
Edit: Added last para, included python imports and corrected the indents.
Issues observed with the initial code
You are missing the schema for your table. You can either use job_config.autodetect = True or job_config.schema = [bigquery.SchemaField("FIELD NAME", "FIELD TYPE")].
From the documentation, you should set job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON for a JSON file source.
You should pass your table_ref variable as an argument instead of your table variable in bigquery_client.load_table_from_file(source_file, table, job_config=job_config).
Link to the documentation
Working Code
The code below works for me. I am using Python 3 and google-cloud-bigquery v1.5.
from google.cloud import bigquery

client = bigquery.Client()
dataset_id, table_id = "TEST_DATASET", "TEST_TABLE"
data_ref = client.dataset(dataset_id)
table_ref = data_ref.table(table_id)
file_path = "path/to/test.json"

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
# job_config.autodetect = True
job_config.schema = [bigquery.SchemaField("Name", "STRING"), bigquery.SchemaField("Age", "INTEGER")]

with open(file_path, 'rb') as source_file:
    job = client.load_table_from_file(source_file, table_ref, location='US', job_config=job_config)

job.result()
print('Loaded {} rows into {}:{}.'.format(job.output_rows, dataset_id, table_id))
Output
>> Loaded 2 rows into TEST_DATASET:TEST_TABLE.

Python Script to query from DynamoDb not giving all items

I have written the following Python code to fetch data from a table, but it's not fetching all the items. When I check the AWS console page for DynamoDB, I can see many more entries than I get from the script.
from __future__ import print_function  # Python 2/3 compatibility
import boto3
import json
import decimal
from datetime import datetime
from boto3.dynamodb.conditions import Key, Attr
import sys

# Helper class to convert a DynamoDB item to JSON.
class DecimalEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, decimal.Decimal):
            if o % 1 > 0:
                return float(o)
            else:
                return int(o)
        return super(DecimalEncoder, self).default(o)

dynamodb = boto3.resource('dynamodb', aws_access_key_id='',
                          aws_secret_access_key='',
                          region_name='eu-west-1',
                          endpoint_url="http://dynamodb.eu-west-1.amazonaws.com")

mplaceId = int(sys.argv[1])
table = dynamodb.Table('XYZ')
response = table.query(
    KeyConditionExpression=Key('mplaceId').eq(mplaceId)
)
print('Number of entries found ', len(response['Items']))
I did the same thing from the AWS console as well, querying by mplaceId.
Any reason why this is happening?
dynamodb.Table.query() returns at most 1 MB of data. From the boto3 documentation:
A single Query operation will read up to the maximum number of items set (if using the Limit parameter) or a maximum of 1 MB of data and then apply any filtering to the results using FilterExpression. If LastEvaluatedKey is present in the response, you will need to paginate the result set. For more information, see Paginating the Results in the Amazon DynamoDB Developer Guide .
That's actually not a boto3 limitation, but a limitation of the underlying Query API.
Instead of implementing pagination yourself, you can use boto3's built-in pagination. Here is an example showing the use of the paginator for querying DynamoDB tables provided by boto3:
import boto3
from boto3.dynamodb.conditions import Key

dynamodb_client = boto3.client('dynamodb')

paginator = dynamodb_client.get_paginator('query')
page_iterator = paginator.paginate(
    TableName='XYZ',
    KeyConditionExpression='mplaceId = :mplaceId',
    # The low-level client needs typed attribute values; mplaceId is the
    # numeric value read from sys.argv[1] in the question's code.
    ExpressionAttributeValues={':mplaceId': {'N': str(mplaceId)}}
)
for page in page_iterator:
    print(page['Items'])
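If you prefer to stay with the resource-level Table.query from the question, the same pagination can be done by hand by following LastEvaluatedKey until it is no longer returned; a minimal sketch (not from the answer):

from boto3.dynamodb.conditions import Key

def query_all_items(table, mplace_id):
    # Collect every matching item by following LastEvaluatedKey page by page.
    items = []
    kwargs = {'KeyConditionExpression': Key('mplaceId').eq(mplace_id)}
    while True:
        response = table.query(**kwargs)
        items.extend(response['Items'])
        if 'LastEvaluatedKey' not in response:
            break
        kwargs['ExclusiveStartKey'] = response['LastEvaluatedKey']
    return items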

How to reset the sequence for IDs on PostgreSQL tables

I recently imported a lot of data from an old database into a new Postgresql database as the basis for models in a new Django site.
I used the IDs from the old database (as rows in various tables refer to each other), but they aren't all sequential - there are often large gaps.
I've noticed that when I add a new object via the Django app, it uses IDs starting from 1, which hasn't been a problem so far because there were no rows with very low IDs.
But once it reaches the first ID used by the legacy data, Postgres complains:
ERROR: duplicate key value violates unique constraint "django_comments_pkey"
DETAIL: Key (id)=(25) already exists.
Looking at the table descriptions I'm guessing I need to reset some kind of sequence on each table:
Table "public.django_comments"
Column | Type | Modifiers
-----------------+--------------------------+--------------------------------------------------------------
id | integer | not null default nextval('django_comments_id_seq'::regclass)
...
What do I need to do to reset that sequence, so that new rows are added with IDs higher than the current maximum ID?
Run sqlsequencereset and it'll print all the reset commands you need.
As suggested by "Dmitry Shevchenko", you can run sqlsequencereset to solve your problem.
or
You can execute the SQL query generated by sqlsequencereset from within python in this way (using the default database):
from django.core.management.color import no_style
from django.db import connection

from myapps.models import MyModel1, MyModel2

sequence_sql = connection.ops.sequence_reset_sql(no_style(), [MyModel1, MyModel2])
with connection.cursor() as cursor:
    for sql in sequence_sql:
        cursor.execute(sql)
I tested this code with Python 3.6, Django 2.0 and PostgreSQL 10.
Here's a short snippet to reset all sequences in Django 1.9+ (based on http://djangosnippets.org/snippets/2774/) and compatible with Python 3:
import os
from io import StringIO

os.environ['DJANGO_COLORS'] = 'nocolor'

from django.core.management import call_command
from django.apps import apps
from django.db import connection

commands = StringIO()
cursor = connection.cursor()

for app in apps.get_app_configs():
    label = app.label
    call_command('sqlsequencereset', label, stdout=commands)

cursor.execute(commands.getvalue())
So the quickest, easiest and most "Django" way to do this in my opinion is to use the following management command:
python manage.py sqlsequencereset app_name
After this, you'll get something such as:
BEGIN;
SELECT setval(pg_get_serial_sequence('"measurements_quantity"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "measurements.Quantities";
SELECT setval(pg_get_serial_sequence('"measurements.Prefixes"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "measurements.Prefixes";
COMMIT;
The next step is to run this in the python manage.py dbshell management command, so run that and you'll see the interactive database shell in your terminal:
psql (11.7 (Debian 11.7-0+deb10u1), server 11.5 (Debian 11.5-1.pgdg90+1))
Type "help" for help.
postgres=# BEGIN;
BEGIN
postgres=# SELECT setval(pg_get_serial_sequence('"measurements.Quantities"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "measurements.Quantities";
setval
--------
1
(1 row)
postgres=# SELECT setval(pg_get_serial_sequence('"measurements.Prefixes"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "measurements.Prefixes";
setval
--------
1
(1 row)
postgres=# COMMIT;
COMMIT
postgres=# exit
Simple as that. The python manage.py sqlsequencereset app_name command will give you the SQL you need to run, and you run it in the dbshell.
No need to write your own custom SQL or custom code, and it gives you what you need in the correct format for the database engine of your choice.
PostgreSQL Command:
ALTER SEQUENCE app_model_id_seq RESTART WITH 1
select setval('django_comments_id_seq', 12345);
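Since the question asks for new rows to get IDs above the current maximum, you would usually point setval at MAX(id) rather than restart at 1; a small sketch (not from the answers) running that from Django, using the table and sequence names from the question:

from django.db import connection

with connection.cursor() as cursor:
    # Move the sequence to the current maximum id so the next nextval()
    # lands above the imported legacy rows.
    cursor.execute(
        "SELECT setval('django_comments_id_seq', "
        "(SELECT COALESCE(MAX(id), 1) FROM django_comments))"
    )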
This snippet, "Run sqlsequencereset on all apps", resets the IDs of all empty models.
Here is a more-or-less completely dynamic solution I just implemented in a management command. It has no restriction on the name of the primary key you are attempting to reset, as it gathers it based on the connection params you have in settings.
The only sequences I could not reset were those whose PK is not an integer, as is the case for django.contrib.sessions, but I have never run into sequencing errors with that, so I doubt it is an issue.
Here is the command, run using python manage.py reset_sequences (as long as your file/command is named reset_sequences.py):
import psycopg2
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import connections


def dictfetchall(cursor):
    """Return all rows from a cursor as a dict"""
    columns = [col[0] for col in cursor.description]
    return [
        dict(zip(columns, row))
        for row in cursor.fetchall()
    ]


class Command(BaseCommand):
    help = "Resets sequencing errors in Postgres which normally occur due to importing/restoring a DB"

    def handle(self, *args, **options):
        # loop over all databases in system to figure out the tables that need to be reset
        for name_to_use_for_connection, connection_settings in settings.DATABASES.items():
            db_name = connection_settings['NAME']
            host = connection_settings['HOST']
            user = connection_settings['USER']
            port = connection_settings['PORT']
            password = connection_settings['PASSWORD']

            # connect to this specific DB
            conn_str = f"host={host} port={port} user={user} password={password}"
            conn = psycopg2.connect(conn_str)
            conn.autocommit = True

            select_all_table_statement = f"""SELECT *
                FROM information_schema.tables
                WHERE table_schema = 'public'
                ORDER BY table_name;
            """

            # just a visual representation of where we are
            print('-' * 20, db_name)

            try:
                not_reset_tables = list()

                # use the specific name for the DB
                with connections[name_to_use_for_connection].cursor() as cursor:
                    # using the current db as the cursor connection
                    cursor.execute(select_all_table_statement)
                    rows = dictfetchall(cursor)

                    # will loop over table names in the connected DB
                    for row in rows:
                        find_pk_statement = f"""
                            SELECT k.COLUMN_NAME
                            FROM information_schema.table_constraints t
                            LEFT JOIN information_schema.key_column_usage k
                            USING(constraint_name,table_schema,table_name)
                            WHERE t.constraint_type='PRIMARY KEY'
                            AND t.table_name='{row['table_name']}';
                        """
                        cursor.execute(find_pk_statement)
                        pk_column_names = dictfetchall(cursor)

                        for pk_dict in pk_column_names:
                            column_name = pk_dict['column_name']
                            # time to build the reset sequence command for each table
                            # taken from django: https://docs.djangoproject.com/en/3.0/ref/django-admin/#sqlsequencereset
                            # example: SELECT setval(pg_get_serial_sequence('"[TABLE]"','id'), coalesce(max("id"), 1), max("id") IS NOT null) FROM "[TABLE]";
                            try:
                                reset_statement = f"""SELECT setval(pg_get_serial_sequence('"{row['table_name']}"','{column_name}'),
                                    coalesce(max("{column_name}"), 1), max("{column_name}") IS NOT null) FROM "{row['table_name']}" """
                                cursor.execute(reset_statement)
                                return_values = dictfetchall(cursor)

                                # will be 1 row
                                for value in return_values:
                                    print(f"Sequence reset to {value['setval']} for {row['table_name']}")
                            except Exception as ex:
                                # will only fail if PK is not an integer...
                                # currently in my system this is from django.contrib.sessions
                                not_reset_tables.append(f"{row['table_name']} not reset")
            except psycopg2.Error as ex:
                raise SystemExit(f'Error: {ex}')

            conn.close()

            print('-' * 5, ' ALL ERRORS ', '-' * 5)
            for item_statement in not_reset_tables:
                # shows which tables produced errors, so far I have only
                # seen this with PK's that are not integers because of the MAX() method
                print(item_statement)

            # just a visual representation of where we are
            print('-' * 20, db_name)
Based on @Paolo Melchiorre's answer, I created a custom management command which resets the sequences for all the models of the chosen apps.
from django.core.management.base import BaseCommand
from django.apps import apps
from django.core.management.color import no_style
from django.db import connection


class Command(BaseCommand):

    def handle(self, *args, **kwargs):
        self.stdout.write('Reset AutoFields ...')
        APPS = ['app1', 'app2']
        APPS = [apps.get_app_config(app) for app in APPS]
        models = []
        for app in APPS:
            models.extend(list(app.get_models()))
        sequence_sql = connection.ops.sequence_reset_sql(no_style(), models)
        with connection.cursor() as cursor:
            for sql in sequence_sql:
                self.stdout.write(sql)
                cursor.execute(sql)
        self.stdout.write(self.style.SUCCESS('Reset AutoField complete.'))
Tested using Python 3.7 and Django 2.2.