I'm trying to merge two dataframes using the pandas merge code below. Each dataframe has just three columns. I've done similar merges before without issue. I've provided the .info() output for each dataframe. I'm getting a TypeError saying that the argument after * must be an iterable, not itertools.imap, and I have no clue what it means. Any tips are very much appreciated.
Data:
pio_smp2_sm.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 12779 entries, 15 to 68311
Data columns (total 3 columns):
entityId 12779 non-null object
targetEntityId 12779 non-null object
eventTime 12779 non-null object
dtypes: object(3)
memory usage: 399.3+ KB
cm_smp2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28035 entries, 40 to 698858
Data columns (total 3 columns):
user_id 28035 non-null object
product_id 28035 non-null object
time_stamp 28035 non-null object
dtypes: object(3)
memory usage: 876.1+ KB
Code:
comp_df2=pd.merge(pio_smp2_sm,cm_smp2,how='inner',left_on=['entityId','targetEntityId'],right_on=['user_id','product_id'])
Error:
TypeErrorTraceback (most recent call last)
<ipython-input-235-6882a22fe6a1> in <module>()
23
24
---> 25 comp_df2=pd.merge(pio_smp2_sm,cm_smp2,how='inner',left_on=['entityId','targetEntityId'],right_on=['user_id','product_id'])
26
27 # print(comp_df2.shape[0])
/data2/user/anaconda2/lib/python2.7/site-packages/pandas/core/reshape/merge.pyc in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
56 copy=copy, indicator=indicator,
57 validate=validate)
---> 58 return op.get_result()
59
60
/data2/user/anaconda2/lib/python2.7/site-packages/pandas/core/reshape/merge.pyc in get_result(self)
580 self.left, self.right)
581
--> 582 join_index, left_indexer, right_indexer = self._get_join_info()
583
584 ldata, rdata = self.left._data, self.right._data
/data2/user/anaconda2/lib/python2.7/site-packages/pandas/core/reshape/merge.pyc in _get_join_info(self)
746 else:
747 (left_indexer,
--> 748 right_indexer) = self._get_join_indexers()
749
750 if self.right_index:
/data2/user/anaconda2/lib/python2.7/site-packages/pandas/core/reshape/merge.pyc in _get_join_indexers(self)
725 self.right_join_keys,
726 sort=self.sort,
--> 727 how=self.how)
728
729 def _get_join_info(self):
/data2/user/anaconda2/lib/python2.7/site-packages/pandas/core/reshape/merge.pyc in _get_join_indexers(left_keys, right_keys, sort, how, **kwargs)
1048
1049 # get left & right join labels and num. of levels at each location
-> 1050 llab, rlab, shape = map(list, zip(* map(fkeys, left_keys, right_keys)))
1051
1052 # get flat i8 keys from label lists
TypeError: type object argument after * must be an iterable, not itertools.imap
Related
I have the following data frame:
id my_year my_month waiting_time target
001 2018 1 95 1
002 2018 1 3 3
003 2018 1 4 0
004 2018 1 40 1
005 2018 2 97 1
006 2018 2 3 3
007 2018 3 4 0
008 2018 3 40 1
I want to groupby my_year and my_month, then in each group I want to compute the my_rate based on
(# of records with waiting_time <= 90 and target = 1)/ total_records in the group
i.e. I am expecting output like:
my_year my_month my_rate
2018 1 0.25
2018 2 0.0
2018 3 0.5
I wrote the following code to compute the desired value my_rate:
def my_rate(data):
    """Return the share of rows with waiting_time <= 90 and target == 1.

    data: a DataFrame (one group handed over by groupby().apply()) that has
        'waiting_time' and 'target' columns.
    Returns: the number of matching rows divided by len(data), as a float.
    Raises: KeyError when the integer labels 0..len(data)-1 are not all
        present in the index of data (see the NOTE in the loop).
    """
    # Despite the names, both of these are pandas Series, not lists.
    waiting_time_list = data['waiting_time']
    target_list = data['target']
    total = len(data)
    my_count = 0
    for i in range(len(data)):
        # NOTE: on an integer index, series[i] looks i up as an index
        # *label*, not as a position. A group whose labels do not start at
        # 0 therefore raises KeyError here, which is the failure this
        # question reports.
        if waiting_time_list[i] <= 90 and target_list[i] == 1:
            my_count += 1
    # float() keeps the division from truncating under Python 2.
    rate = float(my_count)/float(total)
    return rate
df.groupby(['my_year','my_month']).apply(my_rate)
However, I got the following error:
KeyError 0
KeyErrorTraceback (most recent call last)
<ipython-input-29-5c4399cefd05> in <module>()
17
---> 18 df.groupby(['my_year','my_month']).apply(my_rate)
/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/groupby.pyc in apply(self, func, *args, **kwargs)
714 # ignore SettingWithCopy here in case the user mutates
715 with option_context('mode.chained_assignment', None):
--> 716 return self._python_apply_general(f)
717
718 def _python_apply_general(self, f):
/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/groupby.pyc in _python_apply_general(self, f)
718 def _python_apply_general(self, f):
719 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 720 self.axis)
721
722 return self._wrap_applied_output(
/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/groupby.pyc in apply(self, f, data, axis)
1727 # group might be modified
1728 group_axes = _get_axes(group)
-> 1729 res = f(group)
1730 if not _is_indexed_like(res, group_axes):
1731 mutated = True
<ipython-input-29-5c4399cefd05> in conversion_rate(data)
8 #print total_waiting_time_list[i], target_list[i]
9 #print i, total_waiting_time_list[i], target_list[i]
---> 10 if total_waiting_time_list[i] <= 90:# and target_list[i] == 1:
11 convert_90_count += 1
12 #print 'convert ', convert_90_count
/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
/opt/conda/envs/python2/lib/python2.7/site-packages/pandas/core/indexes/base.pyc in get_value(self, series, key)
2426 try:
2427 return self._engine.get_value(s, k,
-> 2428 tz=getattr(series.dtype, 'tz', None))
2429 except KeyError as e1:
2430 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4363)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4046)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5085)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:13913)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:13857)()
KeyError: 0
Any idea what I did wrong here? And how do I fix it? Thanks!
I believe it is better to use the mean of a boolean mask per group:
def my_rate(x):
    """Return the fraction of rows in x with waiting_time <= 90 and target == 1.

    x: a DataFrame (one group from groupby().apply()) that has
        'waiting_time' and 'target' columns.

    The two comparisons are combined into one boolean mask; the mean of a
    boolean mask is the share of True values, so no explicit counting or
    index-based lookup is needed.
    """
    return ((x['waiting_time'] <= 90) & (x['target'] == 1)).mean()
df = df.groupby(['my_year','my_month']).apply(my_rate).reset_index(name='my_rate')
print (df)
my_year my_month my_rate
0 2018 1 0.25
1 2018 2 0.00
2 2018 3 0.50
Any idea what I did wrong here?
The problem is that waiting_time_list and target_list are not lists, but Series:
waiting_time_list = data['waiting_time']
target_list = data['target']
print (type(waiting_time_list))
<class 'pandas.core.series.Series'>
print (type(target_list))
<class 'pandas.core.series.Series'>
Indexing with i therefore fails, because the second group's index labels are 4 and 5, not 0 and 1.
if waiting_time_list[i] <= 90 and target_list[i] == 1:
To avoid this, you can convert each Series to a list:
waiting_time_list = data['waiting_time'].tolist()
target_list = data['target'].tolist()
Versions: Python 2.7.13 and TF 1.2.1
Background: I'm trying to create a single LSTM cell and pass an input of N x M and output N x M+1. I want to pass the output through a softmax layer and then through an Adam optimizer with a loss function of negative log likelihood.
Problem: As stated in the title, when I try to set my training_op = optimizer.minimize(nll) it crashes and asks about a variable scope. What should I do?
Code:
with tf.variable_scope('lstm1', reuse=True):
LSTM_cell_1 = tf.nn.rnn_cell.LSTMCell(num_units=n_neurons, activation=tf.nn.relu)
rnn_outputs_1, states_1 = tf.nn.dynamic_rnn(LSTM_cell_1, X_1, dtype=tf.float32)
rnn_outputs_1 = tf.nn.softmax(rnn_outputs_1)
stacked_rnn_outputs_1 = tf.reshape(rnn_outputs_1, [-1, n_neurons])
stacked_outputs_1 = tf.layers.dense(stacked_rnn_outputs_1, n_outputs)
outputs_1 = tf.reshape(stacked_outputs_1, [-1, n_steps, n_outputs])
mu = tf.Variable(np.float32(1))
sigma = tf.Variable(np.float32(1))
def normal_log(X, mu, sigma, left=-np.inf, right=np.inf):
    """Return the element-wise log-density of X under a normal distribution.

    Evaluates -log(sqrt(2*pi) * sigma) - (X - mu)**2 / (2 * sigma**2) with
    TensorFlow ops, i.e. mu is the mean and sigma the standard deviation.

    X, mu, sigma: tensors or variables; the constants are built as float32,
        so float32 inputs are presumably expected -- TODO confirm.
    left, right: accepted but not used anywhere in this implementation.
    """
    val = -tf.log(tf.constant(np.sqrt(2.0 * np.pi), dtype=tf.float32) * sigma) - \
        tf.pow(X - mu, 2) / (tf.constant(2.0, dtype=tf.float32) * tf.pow(sigma, 2))
    return val
nll = -tf.reduce_sum(normal_log(outputs, mu, sigma))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(nll)
Error message:
ValueError Traceback (most recent call last)
/usr/local/lib/python2.7/site-packages/tensorflow/python/training/optimizer.pyc in minimize(self, loss, global_step, var_list, gate_gradients, aggregation_method, colocate_gradients_with_ops, name, grad_loss)
323
324 return self.apply_gradients(grads_and_vars, global_step=global_step,
--> 325 name=name)
326
327 def compute_gradients(self, loss, var_list=None,
/usr/local/lib/python2.7/site-packages/tensorflow/python/training/optimizer.pyc in apply_gradients(self, grads_and_vars, global_step, name)
444 ([str(v) for _, _, v in converted_grads_and_vars],))
445 with ops.control_dependencies(None):
--> 446 self._create_slots([_get_variable_for(v) for v in var_list])
447 update_ops = []
448 with ops.name_scope(name, self._name) as name:
/usr/local/lib/python2.7/site-packages/tensorflow/python/training/adam.pyc in _create_slots(self, var_list)
126 # Create slots for the first and second moments.
127 for v in var_list:
--> 128 self._zeros_slot(v, "m", self._name)
129 self._zeros_slot(v, "v", self._name)
130
/usr/local/lib/python2.7/site-packages/tensorflow/python/training/optimizer.pyc in _zeros_slot(self, var, slot_name, op_name)
764 named_slots = self._slot_dict(slot_name)
765 if _var_key(var) not in named_slots:
--> 766 named_slots[_var_key(var)] = slot_creator.create_zeros_slot(var, op_name)
767 return named_slots[_var_key(var)]
/usr/local/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.pyc in create_zeros_slot(primary, name, dtype, colocate_with_primary)
172 return create_slot_with_initializer(
173 primary, initializer, slot_shape, dtype, name,
--> 174 colocate_with_primary=colocate_with_primary)
175 else:
176 val = array_ops.zeros(slot_shape, dtype=dtype)
/usr/local/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.pyc in create_slot_with_initializer(primary, initializer, shape, dtype, name, colocate_with_primary)
144 with ops.colocate_with(primary):
145 return _create_slot_var(primary, initializer, "", validate_shape, shape,
--> 146 dtype)
147 else:
148 return _create_slot_var(primary, initializer, "", validate_shape, shape,
/usr/local/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.pyc in _create_slot_var(primary, val, scope, validate_shape, shape, dtype)
64 use_resource=_is_resource(primary),
65 shape=shape, dtype=dtype,
---> 66 validate_shape=validate_shape)
67 variable_scope.get_variable_scope().set_partitioner(current_partitioner)
68
/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.pyc in get_variable(self, var_store, name, shape, dtype, initializer, regularizer, reuse, trainable, collections, caching_device, partitioner, validate_shape, use_resource, custom_getter)
960 collections=collections, caching_device=caching_device,
961 partitioner=partitioner, validate_shape=validate_shape,
--> 962 use_resource=use_resource, custom_getter=custom_getter)
963
964 def _get_partitioned_variable(self,
/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.pyc in get_variable(self, name, shape, dtype, initializer, regularizer, reuse, trainable, collections, caching_device, partitioner, validate_shape, use_resource, custom_getter)
365 reuse=reuse, trainable=trainable, collections=collections,
366 caching_device=caching_device, partitioner=partitioner,
--> 367 validate_shape=validate_shape, use_resource=use_resource)
368
369 def _get_partitioned_variable(
/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.pyc in _true_getter(name, shape, dtype, initializer, regularizer, reuse, trainable, collections, caching_device, partitioner, validate_shape, use_resource)
350 trainable=trainable, collections=collections,
351 caching_device=caching_device, validate_shape=validate_shape,
--> 352 use_resource=use_resource)
353
354 if custom_getter is not None:
/usr/local/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.pyc in _get_single_variable(self, name, shape, dtype, initializer, regularizer, partition_info, reuse, trainable, collections, caching_device, validate_shape, use_resource)
662 " Did you mean to set reuse=True in VarScope? "
663 "Originally defined at:\n\n%s" % (
--> 664 name, "".join(traceback.format_list(tb))))
665 found_var = self._vars[name]
666 if not shape.is_compatible_with(found_var.get_shape()):
ValueError: Variable lstm1/dense/kernel/Adam_1/ already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:
File "<ipython-input-107-eed033b85dc0>", line 11, in <module>
training_op = optimizer.minimize(nll)
File "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2882, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
if self.run_code(code, result):
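It turns out I was executing this section over and over again inside a Python notebook, so the variables the optimizer creates already existed in the graph from the previous run. To all TF rookies out there: remember to reset your kernel (or call tf.reset_default_graph()) before re-running graph-building code.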
I have the following Data Frame named: mydf:
A B
0 3de (1ABS) Adiran
1 3SA (SDAS) Adel
2 7A (ASA) Ronni
3 820 (SAAa) Emili
I want to remove the " (xxxx)" part and keep the remaining values in column A, so that the dataframe (mydf) looks like:
A B
0 3de Adiran
1 3SA Adel
2 7A Ronni
3 820 Emili
I have tried :
print mydf['A'].apply(lambda x: re.sub(r" \(.+\)", "", x) )
but then I get a Series object back and not a dataframe object.
I have also tried to use replace:
df.replace([' \(.*\)'],[""], regex=True), But it didn't change anything.
What am I doing wrong?
Thank you!
You can use the str.split() method:
In [3]: df.A = df.A.str.split('\s+\(').str[0]
In [4]: df
Out[4]:
A B
0 3de Adiran
1 3SA Adel
2 7A Ronni
3 820 Emili
or using str.extract() method:
In [9]: df.A = df.A.str.extract(r'([^\(\s]*)', expand=False)
In [10]: df
Out[10]:
A B
0 3de Adiran
1 3SA Adel
2 7A Ronni
3 820 Emili
I'm trying to create a logistic regression model in tensorflow.
When I try to execute model.fit(input_fn=train_input_fn, steps=200) I get the following error.
TypeError Traceback (most recent call last)
<ipython-input-44-fd050d8188b5> in <module>()
----> 1 model.fit(input_fn=train_input_fn, steps=200)
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.pyc in fit(self, x, y, input_fn, steps, batch_size, monitors)
180 feed_fn=feed_fn,
181 steps=steps,
--> 182 monitors=monitors)
183 logging.info('Loss for final step: %s.', loss)
184 return self
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/estimator.pyc in _train_model(self, input_fn, steps, feed_fn, init_op, init_feed_fn, init_fn, device_fn, monitors, log_every_steps, fail_on_nan_loss)
447 features, targets = input_fn()
448 self._check_inputs(features, targets)
--> 449 train_op, loss_op = self._get_train_ops(features, targets)
450
451 # Add default monitors.
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/linear.pyc in _get_train_ops(self, features, targets)
105 if self._linear_feature_columns is None:
106 self._linear_feature_columns = layers.infer_real_valued_columns(features)
--> 107 return super(LinearClassifier, self)._get_train_ops(features, targets)
108
109 #property
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.pyc in _get_train_ops(self, features, targets)
154 global_step = contrib_variables.get_global_step()
155 assert global_step
--> 156 logits = self._logits(features, is_training=True)
157 with ops.control_dependencies([self._centered_bias_step(
158 targets, self._get_weight_tensor(features))]):
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.pyc in _logits(self, features, is_training)
298 logits = self._dnn_logits(features, is_training=is_training)
299 else:
--> 300 logits = self._linear_logits(features)
301
302 return nn.bias_add(logits, self._centered_bias())
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.pyc in _linear_logits(self, features)
255 num_outputs=self._num_label_columns(),
256 weight_collections=[self._linear_weight_collection],
--> 257 name="linear")
258 return logits
259
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/contrib/layers/python/layers/feature_column_ops.pyc in weighted_sum_from_feature_columns(columns_to_tensors, feature_columns, num_outputs, weight_collections, name, trainable)
173 transformer = _Transformer(columns_to_tensors)
174 for column in sorted(set(feature_columns), key=lambda x: x.key):
--> 175 transformed_tensor = transformer.transform(column)
176 predictions, variable = column.to_weighted_sum(transformed_tensor,
177 num_outputs,
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/contrib/layers/python/layers/feature_column_ops.pyc in transform(self, feature_column)
353 return self._columns_to_tensors[feature_column]
354
--> 355 feature_column.insert_transformed_feature(self._columns_to_tensors)
356
357 if feature_column not in self._columns_to_tensors:
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/contrib/layers/python/layers/feature_column.pyc in insert_transformed_feature(self, columns_to_tensors)
410 mapping=list(self.lookup_config.keys),
411 default_value=self.lookup_config.default_value,
--> 412 name=self.name + "_lookup")
413
414
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/contrib/lookup/lookup_ops.pyc in string_to_index(tensor, mapping, default_value, name)
349 with ops.op_scope([tensor], name, "string_to_index") as scope:
350 shared_name = ""
--> 351 keys = ops.convert_to_tensor(mapping, dtypes.string)
352 vocab_size = array_ops.size(keys)
353 values = math_ops.cast(math_ops.range(vocab_size), dtypes.int64)
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/ops.pyc in convert_to_tensor(value, dtype, name, as_ref)
618 for base_type, conversion_func in funcs_at_priority:
619 if isinstance(value, base_type):
--> 620 ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
621 if ret is NotImplemented:
622 continue
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/constant_op.pyc in _constant_tensor_conversion_function(v, dtype, name, as_ref)
177 as_ref=False):
178 _ = as_ref
--> 179 return constant(v, dtype=dtype, name=name)
180
181
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/python/ops/constant_op.pyc in constant(value, dtype, shape, name)
160 tensor_value = attr_value_pb2.AttrValue()
161 tensor_value.tensor.CopyFrom(
--> 162 tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape))
163 dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
164 const_tensor = g.create_op(
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/tensor_util.pyc in make_tensor_proto(values, dtype, shape)
351 nparray = np.empty(shape, dtype=np_dt)
352 else:
--> 353 _AssertCompatible(values, dtype)
354 nparray = np.array(values, dtype=np_dt)
355 # check to them.
/home/praveen/anaconda/lib/python2.7/site-packages/tensorflow/python/framework/tensor_util.pyc in _AssertCompatible(values, dtype)
288 else:
289 raise TypeError("Expected %s, got %s of type '%s' instead." %
--> 290 (dtype.name, repr(mismatch), type(mismatch).__name__))
291
292
TypeError: Expected string, got 1 of type 'int64' instead.
I'm not sure which feature to check. Could somebody tell me how I could debug this, please? Thanks in advance.
I had a few categorical feature columns whose dtype was int64, so I converted those columns from int to string. After that, the fit step ran to completion. Apparently, TensorFlow expects categorical features to have a string dtype.
I have a input file like this:
j,z,b,bsy,afj,upz,343,13,ruhwd
u,i,a,dvp,ibt,dxv,154,00,adsif
t,a,a,jqj,dtd,yxq,540,49,kxthz
j,z,b,bsy,afj,upz,343,13,ruhwd
u,i,a,dvp,ibt,dxv,154,00,adsif
t,a,a,jqj,dtd,yxq,540,49,kxthz
c,u,g,nfk,ekh,trc,085,83,xppnl
For every unique value of Column1, I need to find out the sum of column7
Similarly, for every unique value of Column2, I need to find out the sum of column7
Output for 1 should be like:
j,686
u,308
t,98
c,83
Output for 2 should be like:
z,686
i,308
a,98
u,83
I am fairly new to Python. How can I achieve the above?
This could be done using Python's Counter and csv library as follows:
from collections import Counter
import csv
# Running totals of column 7, keyed by the column 1 / column 2 value.
c1 = Counter()
c2 = Counter()

with open('input.csv') as f_input:
    for cols in csv.reader(f_input):
        # csv.reader yields strings; column 7 is cols[6] (0-based).
        col7 = int(cols[6])
        c1[cols[0]] += col7
        c2[cols[1]] += col7

# Single-argument print(...) and .items() behave the same on Python 2 and 3.
print("Column 1")
for value, count in c1.items():
    print('{},{}'.format(value, count))

print("\nColumn 2")
for value, count in c2.items():
    print('{},{}'.format(value, count))
Giving you the following output:
Column 1
c,85
j,686
u,308
t,1080
Column 2
i,308
a,1080
z,686
u,85
A Counter is a type of Python dictionary that is useful for counting items automatically. c1 holds all of the column 1 entries and c2 holds all of the column 2 entries. Note, Python numbers lists starting from 0, so the first entry in a list is [0].
The csv library loads each line of the file into a list, with each entry in the list representing a different column. The code takes column 7 (i.e. cols[6]) and converts it into an integer, as all columns are held as strings. It is then added to the counter using either the column 1 or 2 value as the key. The result is two dictionaries holding the totaled counts for each key.
You can use pandas:
df = pd.read_csv('my_file.csv', header=None)
print(df.groupby(0)[6].sum())
print(df.groupby(1)[6].sum())
Output:
0
c 85
j 686
t 1080
u 308
Name: 6, dtype: int64
1
a 1080
i 308
u 85
z 686
Name: 6, dtype: int64
The data frame should look like this:
print(df.head())
Output:
0 1 2 3 4 5 6 7 8
0 j z b bsy afj upz 343 13 ruhwd
1 u i a dvp ibt dxv 154 0 adsif
2 t a a jqj dtd yxq 540 49 kxthz
3 j z b bsy afj upz 343 13 ruhwd
4 u i a dvp ibt dxv 154 0 adsif
You can also use your own names for the columns. Like c1, c2, ... c9:
df = pd.read_csv('my_file.csv', index_col=False, names=['c' + str(x) for x in range(1, 10)])
print(df)
Output:
c1 c2 c3 c4 c5 c6 c7 c8 c9
0 j z b bsy afj upz 343 13 ruhwd
1 u i a dvp ibt dxv 154 0 adsif
2 t a a jqj dtd yxq 540 49 kxthz
3 j z b bsy afj upz 343 13 ruhwd
4 u i a dvp ibt dxv 154 0 adsif
5 t a a jqj dtd yxq 540 49 kxthz
6 c u g nfk ekh trc 85 83 xppnl
Now, group by column 1 c1 or column c2 and sum up column 7 c7:
print(df.groupby(['c1'])['c7'].sum())
print(df.groupby(['c2'])['c7'].sum())
Output:
c1
c 85
j 686
t 1080
u 308
Name: c7, dtype: int64
c2
a 1080
i 308
u 85
z 686
Name: c7, dtype: int64
SO isn't supposed to be a code-writing service, but I had a few minutes. :) Without pandas, you can do it with the csv module:
import csv
def sum_to(results, key, add_value):
    """Add int(add_value) to the running total stored under key in results.

    results: dict mapping key -> integer total; updated in place.
    key: the grouping value; a key not yet present starts from 0.
    add_value: anything int() accepts, e.g. a CSV field such as '343'.
    Raises: whatever int() raises for an unconvertible value (ValueError or
        TypeError); results is left untouched in that case.
    """
    # Convert before touching the dict so that a bad value cannot leave a
    # spurious zero entry behind.
    amount = int(add_value)
    results[key] = results.get(key, 0) + amount
# Per-key totals of column 7, grouped by column 1 and by column 2.
column1_results = {}
column2_results = {}

with open("input.csv", 'rt') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # row[6] is column 7; sum_to converts it to int before adding.
        sum_to(column1_results, row[0], row[6])
        sum_to(column2_results, row[1], row[6])

# Single-argument print(...) behaves the same on Python 2 and 3.
print(column1_results)
print(column2_results)
Results:
{'c': 85, 'j': 686, 'u': 308, 't': 1080}
{'i': 308, 'a': 1080, 'z': 686, 'u': 85}
Your expected results don't seem to match the math that Mike's answer and mine got using your spec. I'd double check that.