Is there any way in Pandas to capture the warning produced by setting error_bad_lines = False and warn_bad_lines = True? For instance the following script:
# Reproduction: pandas prints "Skipping line N" messages for malformed rows
# when warn_bad_lines=True and error_bad_lines=False (Python 2.7 / pandas 0.16).
import pandas as pd
from StringIO import StringIO  # Python 2 only; use io.StringIO on Python 3

# The row "6,7,8,9" has 4 fields while the header declares 3 columns,
# so pandas skips it and reports the skip.
data = StringIO("""a,b,c
1,2,3
4,5,6
6,7,8,9
1,2,5
3,4,5""")
# The skip message is not a warnings-module warning, so warnings.catch_warnings()
# cannot intercept it (it is apparently written straight to a standard stream --
# see the redirect-based answers below).
pd.read_csv(data, warn_bad_lines=True, error_bad_lines=False)
produces the warning:
Skipping line 4: expected 3 fields, saw 4
I'd like to store this output to a string so that I can eventually write it to a log file to keep track of records that are being skipped.
I tried using the warnings module, but it doesn't appear as though this "warning" is a warning in the traditional sense. I'm using Python 2.7 and Pandas 0.16.
I think it isn't implemented in pandas.
source1, source2
My solutions:
1. Pre or after processing
import pandas as pd
import csv

# Solution 1: re-read the raw CSV ourselves to find the rows pandas skipped.
df = pd.read_csv('data.csv', warn_bad_lines=True, error_bad_lines=False)

# Variant A: compare each row's length against a hard-coded expected field count.
RECOMMENDED = 3
with open('data.csv') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    for row in reader:
        if (len(row) != RECOMMENDED):
            print ("Length of row is: %r" % len(row) )
            print row  # Python 2 print statement

# Variant B: derive the expected field count from the parsed DataFrame itself.
lencols = len(df.columns)
print lencols
with open('data.csv') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    for row in reader:
        if (len(row) != lencols):
            print ("Length of row is: %r" % len(row) )
            print row
2. Replaces sys.stdout
import pandas as pd
import os
import sys
class RedirectStdStreams(object):
    """Context manager that temporarily swaps sys.stdout and sys.stderr.

    Either target may be omitted, in which case the stream currently bound
    at construction time is kept for that slot.
    """

    def __init__(self, stdout=None, stderr=None):
        # Fall back to the live streams when no replacement is supplied.
        self._stdout = stdout if stdout else sys.stdout
        self._stderr = stderr if stderr else sys.stderr

    def __enter__(self):
        # Remember the current streams so __exit__ can restore them,
        # flushing first so buffered output lands on the old streams.
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr
        self.old_stdout.flush()
        self.old_stderr.flush()
        sys.stdout = self._stdout
        sys.stderr = self._stderr

    def __exit__(self, exc_type, exc_value, traceback):
        # Flush redirected output, then put the original streams back.
        self._stdout.flush()
        self._stderr.flush()
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr
if __name__ == '__main__':
    # Despite the name, this writes to log.txt, not /dev/null.
    devnull = open('log.txt', 'w')
    # Replaces sys.stdout / sys.stderr, see http://stackoverflow.com/a/6796752/2901002
    with RedirectStdStreams(stdout=devnull, stderr=devnull):
        # Any "Skipping line ..." messages now land in log.txt.
        df = pd.read_csv('data.csv', warn_bad_lines=True, error_bad_lines=False)
I can't help you with older than Python 3, but I've had very good success with the following:
import pandas as pd
from contextlib import redirect_stderr  # Python 3.5+
import io

# Redirect stderr to something we can report on.
f = io.StringIO()
with redirect_stderr(f):
    # NOTE: new_file_name, header_types and logger are defined elsewhere in
    # the original program -- this snippet is not self-contained.
    df = pd.read_csv(
        new_file_name, header=None, error_bad_lines=False, warn_bad_lines=True, dtype=header_types
    )
# Anything pandas wrote to stderr (e.g. "Skipping line ...") is now in f.
if f.getvalue():
    logger.warning("Had parsing errors: {}".format(f.getvalue()))
I searched for this issue a number of times and kept being pointed to this question. Hope it helps someone else later on.
Related
I'm trying to write an AWS Lambda service using Python 2.7 that will generate an In-Memory CSV file and email it as an attachment. I feel like I'm close with this script based on what I've learned but I'm not quite there.
# Import smtplib for the actual sending function
import smtplib
import sys
import csv
import cStringIO  # Python 2 only
from os.path import basename
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
# Import the email modules we'll need

server = smtplib.SMTP('smtp.postmarkapp.com', 587)
server.starttls()
server.login('.....','.....')

list = []  # NOTE: shadows the builtin `list`
row1 = ["One","Two","Three"]
list.append(row1)

msg = MIMEMultipart()
msg['To'] = "daniel#mydomain.com"
msg['From'] = "noreply#mydomain.com"
msg['Subject'] = "DG Test subject"
msg.attach(MIMEText("Test Message"))

# Build the CSV in memory.
csv_buffer = cStringIO.StringIO()
writer = csv.writer(csv_buffer, lineterminator='\n')
writer.writerow(["1","2","3"])
for row in list:
    writer.writerow(row)
print(csv_buffer.getvalue())

# BUG (the subject of this question): attach() expects a MIME part, not a raw
# StringIO buffer, hence "'cStringIO.StringO' object has no attribute
# 'get_content_maintype'". The accepted fix below wraps the text in MIMEText.
msg.attach(csv_buffer)

try:
    response = server.sendmail(msg['From'], ["daniel#mydomain.com"],msg.as_string())
    server.quit()
except AttributeError as error:
    print(error)
else:
    print(response)
This gives me the following error:
1,2,3
One,Two,Three
'cStringIO.StringO' object has no attribute 'get_content_maintype'
Basically it comes down to not being sure how to use the csv_buffer object. Assuming I just need to add that attribute to the object somehow but I'm not quite sure how. If I try to add any additional arguments to the .attach() line, it complains that I have too many arguments.
Thanks!
I figured it out, thanks to stitching together a few SO posts.
import cStringIO  # Python 2 only
import csv

csv_buffer = cStringIO.StringIO()
writer = csv.writer(csv_buffer, delimiter=',', quoting=csv.QUOTE_ALL)
writer.writerow(["1","2","3"])
for row in list:  # `list` is the list of rows built earlier, not the builtin
    writer.writerow(row)
print(csv_buffer.getvalue())

# new lines
# Fix: wrap the CSV text in a MIMEText part so msg.attach() accepts it.
csv_file = MIMEText(csv_buffer.getvalue())
# NOTE: add_header() returns None, so `attachment` is always None; the
# Content-Disposition header is still set on csv_file itself.
attachment = csv_file.add_header('Content-Disposition', 'attachment', filename="csv_file.csv")
msg.attach(csv_file)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import random
from io import open
from argparse import ArgumentParser, FileType, ArgumentDefaultsHelpFormatter
from collections import Counter
from concurrent.futures import ProcessPoolExecutor
import logging
from deepwalk import graph
from deepwalk import walks as serialized_walks
from walks import WalksCorpus
from gensim.models import Word2Vec
from deepwalk.skipgram import Skipgram
from six import text_type as unicode
from six import iteritems
from six.moves import range
import psutil
from multiprocessing import cpu_count
# Pin this process to every available CPU core. psutil exposes this as
# either set_cpu_affinity (older API) or cpu_affinity (newer API); give up
# quietly on builds/platforms without affinity support.
p = psutil.Process(os.getpid())
all_cpus = list(range(cpu_count()))
try:
    p.set_cpu_affinity(all_cpus)
except AttributeError:
    try:
        p.cpu_affinity(all_cpus)
    except AttributeError:
        pass
# Module-level logger; LOGFORMAT is applied via logging.basicConfig() in main().
logger = logging.getLogger(__name__)
LOGFORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"
def debug(type_, value, tb):
    """sys.excepthook replacement that drops into pdb post-mortem on a crash.

    Falls back to the standard hook when running interactively (sys.ps1
    exists) or when stderr is not a terminal (e.g. output is piped), where
    an interactive debugger would misbehave.
    """
    use_default_hook = hasattr(sys, 'ps1') or not sys.stderr.isatty()
    if use_default_hook:
        sys.__excepthook__(type_, value, tb)
        return
    # Imported lazily: only needed on the interactive-debug path.
    import traceback
    import pdb
    traceback.print_exception(type_, value, tb)
    print(u"\n")
    pdb.pm()
def process(args):
    """Load the input graph, generate random walks, and train node embeddings."""
    # Load the graph in the requested format.
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        # Small enough: keep all walks in memory and train directly.
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        # Too large for memory: stream the walks through files on disk.
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        # NOTE(review): this line raises the AttributeError from the question --
        # WalksCorpus was imported directly (`from walks import WalksCorpus`),
        # not via the serialized_walks module. See the answer below.
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
    model.wv.save_word2vec_format(args.output)
def main():
    """Parse command-line options, configure logging, and run the pipeline."""
    arg_parser = ArgumentParser("deepwalk",
                                formatter_class=ArgumentDefaultsHelpFormatter,
                                conflict_handler='resolve')
    add = arg_parser.add_argument
    add("--debug", dest="debug", action='store_true', default=False,
        help="drop a debugger if an exception is raised.")
    add('--format', default='adjlist',
        help='File format of input file')
    add('--input', nargs='?', required=True,
        help='Input graph file')
    add("-l", "--log", dest="log", default="INFO",
        help="log verbosity level")
    add('--matfile-variable-name', default='network',
        help='variable name of adjacency matrix inside a .mat file.')
    add('--max-memory-data-size', default=1000000000, type=int,
        help='Size to start dumping walks to disk, instead of keeping them in memory.')
    add('--number-walks', default=10, type=int,
        help='Number of random walks to start at each node')
    add('--output', required=True,
        help='Output representation file')
    add('--representation-size', default=64, type=int,
        help='Number of latent dimensions to learn for each node.')
    add('--seed', default=0, type=int,
        help='Seed for random walk generator.')
    add('--undirected', default=True, type=bool,
        help='Treat graph as undirected.')
    add('--vertex-freq-degree', default=False, action='store_true',
        help='Use vertex degree to estimate the frequency of nodes '
             'in the random walks. This option is faster than '
             'calculating the vocabulary.')
    add('--walk-length', default=40, type=int,
        help='Length of the random walk started at each node')
    add('--window-size', default=5, type=int,
        help='Window size of skipgram model.')
    add('--workers', default=1, type=int,
        help='Number of parallel processes.')
    args = arg_parser.parse_args()

    # Configure logging before doing any real work.
    logging.basicConfig(format=LOGFORMAT)
    logger.setLevel(getattr(logging, args.log.upper(), None))

    if args.debug:
        sys.excepthook = debug
    process(args)
# Script entry point; sys.exit propagates main()'s return value (None -> 0).
if __name__ == "__main__":
    sys.exit(main())
Error:
Traceback (most recent call last): File "main.py", line 165, in sys.exit(main()) File "main.py", line 162, in main process(args) File "main.py", line 93, in process walks_corpus = serialized_walks.WalksCorpus(walk_files) AttributeError: 'module' object has no attribute 'WalksCorpus'
Why do I get this error?
It looks as though you are importing WalksCorpus on its own from walks with from walks import WalksCorpus. Then when you try to use WalksCorpus, you are looking for it in serialized_walks, which I assume does not have the WalksCorpus method in it.
Try changing this line.
walks_corpus = serialized_walks.WalksCorpus(walk_files)
To:
walks_corpus = WalksCorpus(walk_files)
I'm scraping URLs from a txt file and exporting the results to a CSV file. But after all the processing, my code writes only the information from the last URL. My guess is that I'm forgetting a loop. But where?
Here's my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import urlopen  # Python 2 only

file = open('urls.txt', 'r')  # NOTE: shadows the Python 2 builtin `file`
filelines = (line.strip() for line in file)

for code in filelines:
    site = urlopen(code)
    soup = BeautifulSoup(site, "html.parser")
    # BUG (the subject of this question): `final` is reassigned on every
    # iteration, so after the loop only the last URL's spans remain.
    final = soup.find_all("span", {"class": "bd js-title-main-info"})
    print final  # Python 2 print statement

# This collection happens after the loop, so it only sees the last page.
records = []
for pagetxt in final:
    print pagetxt.text
    records.append((pagetxt.text))

df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')
Thanks
When you read data from the file, you keep only the last value in the variable final. Try to append the data earlier (I've marked the changes with #####):
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib import urlopen  # Python 2 only

file = open('urls.txt', 'r')
filelines = (line.strip() for line in file)

records = [] ###### created before the loop so results accumulate across URLs
for code in filelines:
    site = urlopen(code)
    soup = BeautifulSoup(site, "html.parser")
    final = soup.find_all("span", {"class": "bd js-title-main-info"})
    print final
    for pagetxt in final: ###### collect inside the loop, once per URL
        print pagetxt.text ######
        records.append((pagetxt.text)) ######

df = pd.DataFrame(records, columns=['product name'])
df.to_csv('test.csv', index=False, encoding='utf-8')
Hello, I'm trying to learn programming for a project.
I've been working on a simple script (using tweepy) to download tweets from a search of a keyword into .csv format. However, I keep getting a syntax error on multiple lines (from 28 to 38) and I don't know what is wrong at this point. Can somebody tell me what's wrong?
here is the code i've been working on...
# -*- coding: utf-8 -*-
#import modules
import tweepy
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import unicodecsv
from unidecode import unidecode
import csv
from textblob import TextBlob

# Twitter API credentials (redacted).
ckey = "XXXXXXXXXXXXXXXXXX"
csecret = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
atoken = "XXXXXXXXXXXX-XXXXXXXXXXXXXXXXXXX"
asecret = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
OAUTH_KEYS = {'consumer_key':ckey, 'consumer_secret':csecret, 'access_token_key':atoken, 'access_token_secret':asecret}
auth = tweepy.OAuthHandler(OAUTH_KEYS['consumer_key'], OAUTH_KEYS['consumer_secret'])
api = tweepy.API(auth)

fName= raw_input("Nombre del Archivo: ")+'.csv'  # Python 2 raw_input

for tweet in tweepy.Cursor(api.search, q=('dulceveneno'), since='2014-09-16', until='2017-07-25').items(5):
    # SYNTAX ERROR (the subject of this question): the list elements below
    # are missing the commas between them -- see the answer that follows.
    tweet_info = [tweet.author.name.encode('utf8')
    tweet.author.screen_name.encode('utf8')
    tweet.created_at
    tweet.text.encode('utf8')
    tweet.retweeted
    tweet.favorited
    tweet.user.location.encode('utf8')
    tweet.user.time_zone
    tweet.geo
    tweet.entities.get('hashtags')]
    with open(fName, 'wb') as file:
        writer = unicodecsv.writer(file, delimiter = ',', quotechar = '"')
        # Write header row.
        # NOTE: commas are also missing after "Retweet?" and "Favs" below, so
        # Python would silently concatenate those adjacent string literals.
        writer.writerow(["Nombre",
        "UserName",
        "Fecha",
        "Tweet",
        "Retweet?"
        "Favs"
        "UbicaciĆ³n",
        "Horario",
        "Geo",
        "Hashtags"])
        # Write data to CSV.
        writer.writerow(tweet_info)
        # Show progress.
        print("DONE!" % q)  # NOTE: `q` is undefined and "DONE!" has no % placeholder
The problem is in the "tweet_info" part, I guess...
You haven't posted the actual error yet, but I can see for tweet_info you do not have any commas , for the elements in the list.
It should be:
# Corrected version: every element of the list is now comma-separated.
# (`tweet` comes from the tweepy.Cursor loop in the question's code.)
tweet_info = [tweet.author.name.encode('utf8'),
              tweet.author.screen_name.encode('utf8'),
              tweet.created_at,
              tweet.text.encode('utf8'),
              tweet.retweeted,
              tweet.favorited,
              tweet.user.location.encode('utf8'),
              tweet.user.time_zone,
              tweet.geo,
              tweet.entities.get('hashtags')]
Is there anything in python that can replicate the functionality of freopen() in C or C++? To be precise, I want to replicate the functionality of:
freopen("input.txt","r",stdin);
and
freopen("output.txt","w",stdout);
And then use the same (standard) functions for console I/O for file I/O. Any ideas?
sys.stdout is simply file object, so, you can reopen it to another destination
# Keep a reference to the original stream so it can be restored afterwards.
out = sys.stdout
sys.stdout = open('output.txt', 'w')
# do some work   (fixed: the original snippet used a C-style "//" comment,
# which is a syntax error in Python)
sys.stdout = out
out is only for restoring the sys.stdout destination to its default after the work (as suggested by Martijn Pieters — you can also restore it using sys.__stdout__, or not restore it at all if you don't need to).
Try this:
import sys

# Rebind the standard streams at module level: after these two lines,
# reads from stdin come from input.txt and prints go to output.txt.
# Both files must exist / be creatable in the working directory.
sys.stdin = open('input.txt', 'r')
sys.stdout = open('output.txt', 'w')
Text files are self explanatory.
You can now run this code on Sublime Text or any other text editor.
If you're working on *nix platform, you can write your own freopen.
def freopen(f, option, stream):
    """POSIX-style re-implementation of C's freopen().

    Opens file *f* with mode *option* and repoints *stream*'s underlying
    file descriptor at it via os.dup2(), so existing references to
    *stream* (e.g. sys.stdout) transparently use the new file.
    """
    import os
    replacement = open(f, option)
    # dup2 closes stream's old descriptor and duplicates the new one onto
    # it in a single step.
    os.dup2(replacement.fileno(), stream.fileno())
import sys

# Example: redirect stdout to a file named "hello", then print to it.
freopen("hello","w",sys.stdout)
print "world"  # Python 2 print statement
You may also want to look at the contextmanager decorator in contextlib for temporary redirection:
from contextlib import contextmanager
import sys

# Fixed: the decorator must be @contextmanager -- the original snippet had
# "#contextmanager", which commented the decorator out entirely and left
# stdout_redirected as a plain generator function.
@contextmanager
def stdout_redirected(new_stdout):
    """Temporarily point sys.stdout at *new_stdout* for the with-block."""
    save_stdout = sys.stdout
    sys.stdout = new_stdout
    try:
        yield
    finally:
        # Restore the original stream even if the body raised.
        sys.stdout = save_stdout
Example:
# Example usage (`filename` is a placeholder; Python 2 print statement below):
with open(filename, "w") as f:
    with stdout_redirected(f):
        print "Hello"
This should help:
import sys
def freopen(filename, mode):
    """Rebind a standard stream to *filename*, mimicking C's freopen().

    Mode "r" redirects sys.stdin, mode "w" redirects sys.stdout; any other
    mode is silently ignored.
    """
    stream_attr = {"r": "stdin", "w": "stdout"}.get(mode)
    if stream_attr is not None:
        setattr(sys, stream_attr, open(filename, mode))
# ---- MAIN ----
# Requires input.txt to exist in the working directory; output.txt is created.
freopen("input.txt", "r")
freopen("output.txt", "w")