Vertex AI batch prediction results in ('ValueError', 1) - google-cloud-platform

I have a predictor that works successfully when run in a local container, but when that container is pushed to Artifact Registry, imported as a Vertex AI model, and run in batch prediction mode, it returns an empty prediction file and an error file that simply says ('ValueError', 1).
The requests are served using Flask.
The code (main.py) is:
import os

import pandas as pd
from flask import Flask, request, jsonify, make_response

import model_active  # my module that loads the model and encoders from GCS

app = Flask(__name__)

if 'AIP_HEALTH_ROUTE' in os.environ:
    AIP_HEALTH_ROUTE = os.environ['AIP_HEALTH_ROUTE']
else:
    AIP_HEALTH_ROUTE = '/health'

if 'AIP_PREDICT_ROUTE' in os.environ:
    AIP_PREDICT_ROUTE = os.environ['AIP_PREDICT_ROUTE']
else:
    AIP_PREDICT_ROUTE = '/predict'

@app.route(AIP_HEALTH_ROUTE, methods=['GET'])
def health():
    response = {"message": "OK", "code": "SUCCESS"}
    return make_response(jsonify(response), 200)

@app.route(AIP_PREDICT_ROUTE, methods=['POST'])
def predict():
    try:
        instances = pd.DataFrame(request.json.get("instances"))
        # creates a model using model and encoders stored at given GCS location
        model = model_active.CustomPredictor('datascience-modelartifacts/my-model/actives')
        processed_data = model.process_score_data(instances)
        predictions = model.predict(processed_data)
        response = {"predictions": predictions}
        status = 200
    except Exception as e:
        response = {"error": str(e)}
        status = 500
    return make_response(jsonify(response), status)
For local testing, the data looks like:
dat = {"instances": [
    {"F1": 1000854828492, "F2": 3000076437878, "F3": 19.99, ...},
    {"F1": 1000222326963, "F2": 3000127917915, "F3": 19.99, ...},
    ...
]}
I call this with
import requests
url = 'http://localhost:5000/predict'
headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
r = requests.post(url, json=dat, headers=headers)
and in this case everything works as expected (I get a list of predicted classes returned).
I can successfully deploy this model as an endpoint and provide this data as test and get predictions returned.
However, for Vertex batch prediction (which is what I ultimately want to enable, since we only need this model once a month), I have tried providing this data as both JSONL and CSV, and I keep receiving output with no successful predictions and an error file that simply says ('ValueError', 1).
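For reference, a minimal sketch of turning the same records used for local testing into the JSONL a batch job consumes: one instance object per line, with no wrapping "instances" key, which (as I understand it) Vertex AI then groups into {"instances": [...]} requests to the container.
import json

# Rough sketch: write one JSON object per line from the local-test dict `dat` above
with open("instances.jsonl", "w") as f:
    for instance in dat["instances"]:
        f.write(json.dumps(instance) + "\n")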
Does anyone have any suggestions what this error means and what is causing it?

Related

DRF test multiple image upload

I'm trying to test multiple image upload to my server. Here is the serializer:
class ImageSerializer(serializers.ModelSerializer):
    image = serializers.ListField(
        child=serializers.ImageField(allow_empty_file=True)
    )
ImageFactory:
def get_image():
    image = Image.new("RGB", (2000, 2000))
    file = tempfile.NamedTemporaryFile(suffix=".jpg")
    image.save(file)
    return file
Test:
def test_upload_multiple_images(self):
    self.image = get_image()
    with open(self.image.name, "rb") as file:
        payload = {
            "image": [file, file]
        }
        response = self.client.post(
            reverse("gallery-list", args=[self.item.pk]),
            data=payload,
            format="multipart"
        )
When testing via Postman, images from the array are saved correctly. However when using the test case, I get the following message from the response:
{'image': [{'message': 'The submitted file is empty.', 'code': 'empty'}]}
Before adding allow_empty_file=True, there were two of those messages being returned.
Has anyone got any idea why that would happen?
The problem here is that you're sending the same file object to be saved twice. Once the first one has been read and saved, the file's read position sits at the end of the data, so reading the second (identical) object yields nothing and the serializer reports the submitted file as empty.
So if you send the same image object as two different items, only the first one will be readable. To avoid this, you'll have to send two different image objects.
Your ImageFactory can be refactored to:
def get_image(count=1):
    images = []
    for _ in range(count):
        file = io.BytesIO()
        image = Image.new('RGBA', size=(100, 100), color=(155, 0, 0))
        image.save(file, 'png')
        file.name = 'test.png'
        file.seek(0)
        images.append(file)
    return images[0] if count == 1 else images
and your Test:
def test_upload_multiple_images(self):
    self.images = get_image(2)
    payload = {
        "image": self.images
    }
    response = self.client.post(
        reverse("gallery-list", args=[self.item.pk]),
        data=payload,
        format="multipart"
    )
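Note that each BytesIO buffer is rewound with file.seek(0) after the image is written to it; without that, the test client would read from the end of the buffer when building the multipart request and you would be back to the "submitted file is empty" error.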

Mocking external API in Django

I am trying to mock an external API in Django but am not sure how to do it properly.
Basically, the test must mock the JSON data from the external API and then create a new object if all values are valid.
The program fetches geolocation data for the given IP address and saves the object in the database if the response includes all required fields. So, how can I mock this process to test creation of a new object?
services.py
import os
import requests
from .exceptions import ExternalApiException
def get_location(ip):
    url = f'http://api.ipstack.com/{ip}'
    params = {'access_key': os.environ.get('ACCESS_KEY')}
    try:
        res = requests.get(url, params=params)
        data = res.json()
        return {
            'ip': data['ip'],
            'country_name': data['country_name'],
            'region_code': data['region_code'],
            'city': data['city'],
            'latitude': data['latitude'],
            'longitude': data['longitude'],
            'zip_code': data['zip'],
        }
    except requests.exceptions.ConnectionError:
        raise ExternalApiException('Connection error occurred during the fetch process')
    except requests.exceptions.Timeout:
        raise ExternalApiException("Connection timeout. Please check your internet connection and try again later")
    except requests.exceptions.TooManyRedirects:
        raise ExternalApiException("Too many redirects")
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
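The views.py that calls this service isn't shown; for the patch used below to take effect, the view has to resolve the function through the services module at call time. A purely hypothetical sketch of such a view (names invented, not from the original post):
# views.py (hypothetical sketch -- the real view is not shown in the post)
from rest_framework.views import APIView
from rest_framework.response import Response

from . import services


class LocationCreateView(APIView):
    def post(self, request):
        # Resolved as services.get_location at call time, so patching
        # 'geolocation.services.get_location' replaces this call in tests.
        data = services.get_location(request.data.get('ip', ''))
        # ... create and save the model instance from `data` ...
        return Response(data, status=201)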
tests.py
# I am lost in this part
@patch('geolocation.services.get_location')
def test_create_basic_geolocation(self, mock_request):
    """Test creating geolocation data"""
    payload = {
        'ip': '',
    }
    res = self.client.post(LOCATIONS_URL, payload)
    self.assertTrue(res.data['ip'])
Thanks for any help.
Just assign return_value on the mocked function, like this:
@patch('geolocation.services.get_location')
def test_create_basic_geolocation(self, mock_request):
    """Test creating geolocation data"""
    mock_request.return_value = {"ip": "hello", "country_name": "test"}
    payload = {
        'ip': '',
    }
    res = self.client.post(LOCATIONS_URL, payload)
    self.assertTrue(res.data['ip'])
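If the view ends up using more of the fields that get_location returns, the mock can return the full shape of the service response. The field names below are taken from services.py above; the values are just placeholders. This also assumes the view looks the function up through the services module (e.g. services.get_location(...)), so that the patch target 'geolocation.services.get_location' matches where it is resolved.
# In the same test class as above; requires `from unittest.mock import patch`
@patch('geolocation.services.get_location')
def test_create_geolocation_full_payload(self, mock_get_location):
    """Test creating geolocation data with a fully mocked response"""
    # Mirror the dict that services.get_location returns (placeholder values)
    mock_get_location.return_value = {
        'ip': '1.2.3.4',
        'country_name': 'Testland',
        'region_code': 'TL',
        'city': 'Testville',
        'latitude': 0.0,
        'longitude': 0.0,
        'zip_code': '00000',
    }
    res = self.client.post(LOCATIONS_URL, {'ip': '1.2.3.4'})
    self.assertTrue(res.data['ip'])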

Request in flask API deployed in Cloud Run doesn't work

I have a Flask application that should receive an image per POST request. The request works against the server running locally, but it does not work against my API deployed on Cloud Run :(
I would appreciate any pointers!
Regards, Helena.
@app.route("/predict", methods=["GET", "POST"])
def predict():
    # initialize the data dictionary that will be returned from the view
    data = {"success": False}
    # ensure an image was properly uploaded to our endpoint
    if flask.request.method == 'POST':
        _file = flask.request.files['image']
        if _file:
            # read the image in PIL format
            thermal_image = flask.request.files['image'].read()
            # get thermal region on embedded image
            try:
                image = get_cropped_embedded(thermal_image)
                #print(image)
            except Exception as e:
                print(e)
                data = {"Success": 'Be positive!'}
                return flask.jsonify(data)
            ################ More code
            data.update({
                "path": path,
                "success": True
            })
        else:
            logging.info("No hot point found out. Exiting without classification.")
            return flask.jsonify(data)
    # return the data dictionary as a JSON response
    return flask.jsonify(data)

if __name__ == "__main__":
    app.run('0.0.0.0', port=5000)
[screenshots: desired result / actual result]
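One thing worth checking (assuming the container starts the app through the __main__ block above): Cloud Run tells the container which port to listen on via the PORT environment variable (8080 by default) and routes requests to that port, so a server hard-coded to port 5000 may never see the incoming requests. A minimal sketch of reading the port from the environment:
import os

if __name__ == "__main__":
    # Cloud Run injects the port to serve on through the PORT env var (default 8080)
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 8080)))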

Test cases for Django Rest Framework; struggling to get a correct response

Update: Solved my own problem: I've learnt that Django creates its own test database, and as such, it needs to be populated with data. I ran my importer in my test cases and it all worked. So, if you're also wondering why your tests don't work, check that you've got some data in the test DB!
End Update
I am writing tests for my Django Rest Framework API but I am struggling to get my code to return a 200 OK. At the moment, my test case continually returns a 404 Not Found.
I'm in the early stages of writing tests, and have a lot to learn. I'm currently following https://www.django-rest-framework.org/api-guide/testing/
I'm trying to test an endpoint at the following URL
# Not shown here: all of these URLs are prefixed with /api/v1
path('case/<int:pk>/', EntireCaseView.as_view(), name='case'),
I have an object in my database with an ID (primary key) of 1. I can successfully query the API by going to http://localhost:8000/api/v1/case/1/
I receive a valid JSON response (Trampe is a rabbit):
{
    "id": 1,
    "total_points": 5000,
    "passing_points": 3700,
    "budget": 5000,
    "description": "Saving Trampe from Trauma",
    "name": "Trampe",
    "signalment": "8yr, intact male, mixed breed.",
    "problem": "Respiratory difficulty",
    "image": {
        "id": 1,
        "file": "http://localhost:8000/media/images/trampe.jpg",
        "description": "A lovely picture of Trampe"
    },
My API requires authentication, and as such I am providing authentication in my test case.
class CaseTests(APITestCase):
    def test_status_code(self):
        """
        Ensure that case/1 returns 200 OK.
        """
        # Create a test user
        test_user = User(username='jim', password='monkey123', email='jim@jim.com')
        test_user.save()
        # Build a factory and get our user Jim
        factory = APIRequestFactory()
        user = User.objects.get(username='jim')
        # Get our view to test and the url, too
        view = EntireCaseView.as_view()
        url = reverse('case', kwargs={'pk': '1'})
        print(url.__str__())
        # Make an authenticated request to the view...
        request = factory.get(url)
        print(request.get_full_path())
        force_authenticate(request, user=user)
        response = view(request, "1")
        print(response.data)
        self.assertEqual(response.status_code, status.HTTP_200_OK)
Of interest here (at least to me) are the lines
url = reverse('case', kwargs={'pk': '1'})
and
response = view(request, "1")
If I leave out either the kwargs argument in url = reverse('case', kwargs={'pk': '1'}) or the "1" in response = view(request, "1"), I receive an error saying that the get() method requires 2 positional arguments but was only given one.
Here is the signature of the get() method in my view.
class EntireCaseView(APIView):
    def get(self, request, pk):
If I run my test, Django reports that it fails because of a 404.
self.assertEqual(response.status_code, status.HTTP_200_OK)
AssertionError: 404 != 200
What I am trying to work out is why this is the case. Both print(url.__str__()) and print(request.get_full_path()) output /api/v1/case/1/.
So in summary, I am trying to understand why I'm receiving this 404, and ultimately, how I can test this, and other endpoints.
Any and all help is appreciated.
Cheers,
C
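As noted in the update, the test database starts out empty, so the object with pk=1 has to be created inside the test case itself. A minimal sketch of doing that in setUp (the Case model and its field names are guesses based on the JSON response above, not the actual importer used in the fix):
class CaseTests(APITestCase):
    def setUp(self):
        # Populate the otherwise-empty test database with the object the test
        # fetches. "Case" and its fields are assumptions inferred from the
        # JSON response shown above.
        Case.objects.create(
            id=1,
            name='Trampe',
            description='Saving Trampe from Trauma',
            total_points=5000,
            passing_points=3700,
            budget=5000,
            signalment='8yr, intact male, mixed breed.',
            problem='Respiratory difficulty',
        )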

Method works fine in iPython but runs endlessly on Gunicorn

I wrote an app in the Falcon framework that I am running with the Gunicorn server. When the server starts, the app first trains a random forest model:
forest = sklearn.ensemble.ExtraTreesClassifier(n_estimators=150, n_jobs=-1)
forest.fit(x, t)
and then returns probabilities for requests posted to it. This works fine on my server when I run the code in iPython (training the model takes 15 s on 12 cores).
When I was writing the app, I set n_estimators=10 and everything was working. When I finished tweaking the app, I set n_estimators back to 150. However, when I then ran Gunicorn with gunicorn -c ./app.conf app:app, I could see in htop that forest.fit(x, t) runs for a few seconds on all cores, after which the usage of all cores drops to 0. After that, the method keeps running indefinitely until the Gunicorn worker times out after 10 minutes.
This is my first time using Gunicorn and Falcon, or any WSGI technology for that matter, and I am clueless as to what might be causing the problem or how to troubleshoot it.
Edit:
The settings file for gunicorn:
# app.conf
# run with: gunicorn -c ./app.conf app:app
import sys
sys.path.append('/home/user/project/Module')

bind = "127.0.0.1:8123"
timeout = 60 * 20  # Timeout worker after more than 20 minutes
The falcon code:
class Processor(object):
    """ Processor object handles the training of the models,
    feature generation of requests and probability predictions.
    """
    # data statistics used in feature calculations
    data_statistics = {}
    # Classification targets
    targets = ()
    # Select features for the models.
    cols1 = [ #...
    ]
    cols2 = [ #...
    ]
    model_1 = ExtraTreesClassifier(n_estimators=150, n_jobs=-1)
    model_2 = ExtraTreesClassifier(n_estimators=150, n_jobs=-1)

    def __init__(self, features_dataset, tr_prepro):
        # Get the datasets
        da_1, da_2 = self.prepare_datasets(features_dataset)
        # Train models
        # ----THIS IS WHERE THE PROGRAM HANGS -----------------------------------
        self.model_1.fit(da_1.x, utils.vectors_to_labels(da_1.t))
        # -----------------------------------------------------------------------
        self.model_2.fit(da_2.x, utils.vectors_to_labels(da_2.t))
        # Generate data statistics for feature calculations
        self.calculate_data_statistics(tr_prepro)

    def prepare_datasets(self, features_dataset):
        sel_cols = [ #...
        ]
        # Build dataset
        d = features_dataset[sel_cols].dropna()
        da, scalers = ft.build_dataset(d, scaling='std', target_feature='outcome')
        # Binarize data
        da_bin = utils.binirize_dataset(da)
        # What are the classification targets
        self.targets = da_bin.t_labels
        # Prepare the datasets
        da_1 = da_bin.select_attributes(self.cols1)
        da_2 = da_bin.select_attributes(self.cols2)
        return da_1, da_2

    def calculate_data_statistics(self, tr_prepro):
        logger.info('Getting data and feature statistics...')
        #...
        logger.info('Done.')

    def import_data(self, data):
        # convert dictionary generated from json to Pandas DataFrame
        return tr

    def generate_features(self, tr):
        # Preprocessing, Feature calculations, imputation
        return tr

    def predict_proba(self, data):
        # Convert Data
        tr = self.import_data(data)
        # Generate features
        tr = self.generate_features(tr)
        # Select model based on missing values - either no. 1 or no. 2
        tr_1 = #...
        tr_2 = #...
        # Get the probabilities from different models
        if tr_1.shape[0] > 0:
            tr_1.loc[:, 'prob'] = self.model_1.predict_proba(tr_1.loc[:, self.cols1])[:, self.targets.index('POSITIVE')]
        if tr_2.shape[0] > 0:
            tr_2.loc[:, 'prob'] = self.model_2.predict_proba(tr_2.loc[:, self.cols2])[:, self.targets.index('POSITIVE')]
        return pd.concat([tr_1, tr_2], axis=0)

    @staticmethod
    def export_single_result(tr):
        result = {'sample_id': tr.loc[0, 'sample_id'],
                  'batch_id': tr.loc[0, 'batch_id'],
                  'prob': tr.loc[0, 'prob']
                  }
        return result
class JSONTranslator(object):
    def process_request(self, req, resp):
        """Generic method for extracting json from requests

        Throws
        ------
        HTTP 400 (Bad Request)
        HTTP 753 ('Syntax Error')
        """
        if req.content_length in (None, 0):
            # Nothing to do
            return
        body = req.stream.read()
        if not body:
            raise falcon.HTTPBadRequest('Empty request body',
                                        'A valid JSON document is required.')
        try:
            req.context['data'] = json.loads(body.decode('utf-8'))
        except (ValueError, UnicodeDecodeError):
            raise falcon.HTTPError(falcon.HTTP_753,
                                   'Malformed JSON',
                                   'Could not decode the request body. The '
                                   'JSON was incorrect or not encoded as '
                                   'UTF-8.')

    def process_response(self, req, resp, resource):
        """Generic method for putting response to json

        Does not do anything if 'result_json' not in req.context.
        """
        if 'result_json' not in req.context:
            return
        resp.body = json.dumps(req.context['result_json'])
class ProbResource(object):
    def __init__(self, processor):
        self.schema_raw = open(config.__ROOT__ + "app_docs/broadcast_schema.json").read()
        self.schema = json.loads(self.schema_raw)
        self.processor = processor

    def validate_request(self, req):
        """ Validate the request json against the schema.

        Throws
        ------
        HTTP 753 ('Syntax Error')
        """
        data = req.context['data']
        # validate the json
        try:
            v = jsonschema.Draft4Validator(self.schema)  # using jsonschema draft 4
            err_msg = str()
            for error in sorted(v.iter_errors(data), key=str):
                err_msg += str(error)
            if len(err_msg) > 0:
                raise falcon.HTTPError(falcon.HTTP_753,
                                       'JSON failed validation',
                                       err_msg)
        except jsonschema.ValidationError as e:
            print("Failed to use schema:\n" + str(self.schema_raw))
            raise e

    def on_get(self, req, resp):
        """Handles GET requests

        Throws
        ------
        HTTP 404 (Not Found)
        """
        self.validate_request(req)
        data = req.context['data']
        try:
            # get probability
            tr = self.processor.predict_proba(data)
            # convert pandas dataframe to dictionary
            result = self.processor.export_single_result(tr)
            # send the dictionary away
            req.context['result_json'] = result
        except Exception as ex:
            raise falcon.HTTPError(falcon.HTTP_404, 'Error', ex.message)
        resp.status = falcon.HTTP_200
# Get data
features_data = fastserialize.load(config.__ROOT__ + 'data/samples.s')
prepro_data = fastserialize.load(config.__ROOT__ + 'data/prepro/samples_preprocessed.s')
# Get the models - this is where the code hangs
sp = SampleProcessor(features_data, prepro_data)
app = falcon.API(middleware=[JSONTranslator()])
prob = ProbResource(sp)
app.add_route('/prob', prob)
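One way to narrow this down might be a stripped-down repro that only performs the fit at import time and serves a trivial route, run under the same Gunicorn command. If this bare version also hangs with n_jobs=-1 but completes with n_jobs=1, the problem sits in how joblib's worker pool behaves inside the Gunicorn worker rather than in the rest of the app. A sketch (module name and data are made up, not part of the original project):
# repro_app.py -- run with: gunicorn -c ./app.conf repro_app:app
import numpy as np
import falcon
from sklearn.ensemble import ExtraTreesClassifier

# Synthetic data just to exercise the fit under the WSGI server
x = np.random.rand(10000, 20)
t = np.random.randint(0, 2, size=10000)

# Compare n_jobs=-1 (parallel, as in the real app) against n_jobs=1 (serial)
forest = ExtraTreesClassifier(n_estimators=150, n_jobs=-1)
forest.fit(x, t)


class Ping(object):
    def on_get(self, req, resp):
        resp.status = falcon.HTTP_200


app = falcon.API()
app.add_route('/ping', Ping())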