split a sequence by delimiter in clojure? - clojure

Say I have a sequence in clojure like
'(1 2 3 6 7 8)
and I want to split it up so that the list splits whenever an element divisible by 3 is encountered, so that the result looks like
'((1 2) (3) (6 7 8))
(EDIT: What I actually need is
[[1 2] [3] [6 7 8]]
, but I'll take the sequence version too : )
What is the best way to do this in clojure?
partition-by is no help:
(partition-by #(= (rem % 3) 0) '(1 2 3 6 7 8))
; => ((1 2) (3 6) (7 8))
split-with is close:
(split-with #(not (= (rem % 3) 0)) '(1 2 3 6 7 8))
; => [(1 2) (3 6 7 8)]

Something like this?
(defn partition-with
  "Lazily cuts coll into consecutive runs. The first element always opens a
  run; every later element for which f returns logical true opens the next."
  [f coll]
  (lazy-seq
    (when-let [[head & tail :as s] (seq coll)]
      (let [continues? (complement f)
            ;; the head is taken unconditionally, then everything up to the
            ;; next element that f flags as a boundary
            chunk (cons head (take-while continues? tail))
            remaining (seq (drop (count chunk) s))]
        (cons chunk (partition-with f remaining))))))
(partition-with #(= (rem % 3) 0) [1 2 3 6 7 8 9 12 13 15 16 17 18])
=> ((1 2) (3) (6 7 8) (9) (12 13) (15 16 17) (18))

This is an interesting problem. I recently added a function split-using to the Tupelo library, which seems like a good fit here. I left the spyx debug statements in the code below so you can see how things progress:
(ns tst.clj.core
(:use clojure.test tupelo.test)
(:require
[tupelo.core :as t] ))
(t/refer-tupelo)
(defn start-segment?
  "Returns true when the first element of vals is evenly divisible by 3."
  [vals]
  (-> vals
      first
      (rem 3)
      zero?))
;; Splits vals-in into a vector of segments. Each pass takes the first
;; remaining value unconditionally, then appends whatever split-using returns
;; as the prefix of the rest; the unprocessed remainder feeds the next pass.
;; pred is handed to split-using, so it receives the remaining values as a
;; collection rather than a single element.
(defn partition-using [pred vals-in]
(loop [vals vals-in
result []]
(if (empty? vals)
result
;; spy-let / spyx are the debug forms left in on purpose; they produce the
;; trace shown after this definition
(t/spy-let [
;; the head always opens the current segment
out-first (take 1 vals)
[out-rest unprocessed] (split-using pred (spyx (next vals)))
out-vals (glue out-first out-rest)
new-result (append result out-vals)]
(recur unprocessed new-result)))))
Which gives us output like:
out-first => (1)
(next vals) => (2 3 6 7 8)
[out-rest unprocessed] => [[2] (3 6 7 8)]
out-vals => [1 2]
new-result => [[1 2]]
out-first => (3)
(next vals) => (6 7 8)
[out-rest unprocessed] => [[] [6 7 8]]
out-vals => [3]
new-result => [[1 2] [3]]
out-first => (6)
(next vals) => (7 8)
[out-rest unprocessed] => [[7 8] ()]
out-vals => [6 7 8]
new-result => [[1 2] [3] [6 7 8]]
(partition-using start-segment? [1 2 3 6 7 8]) => [[1 2] [3] [6 7 8]]
or for a larger input vector:
(partition-using start-segment? [1 2 3 6 7 8 9 12 13 15 16 17 18 18 18 3 4 5])
=> [[1 2] [3] [6 7 8] [9] [12 13] [15 16 17] [18] [18] [18] [3 4 5]]
You could also create a solution using nested loop/recur, but that is already coded up in the split-using function:
(defn split-using
"Splits a collection based on a predicate with a collection argument.
Finds the first index N such that (pred (drop N coll)) is true. Returns a length-2 vector
of [ (take N coll) (drop N coll) ]. If pred is never satisfied, [ coll [] ] is returned."
[pred coll]
(loop [left []
right (vec coll)]
(if (or (empty? right) ; don't call pred if no more data
(pred right))
[left right]
; pred did not match here: move one element from the unprocessed side
; onto the accumulated prefix and try again
(recur (append left (first right))
(rest right)))))
Actually, the above function seems like it would be useful in the future. partition-using has now been added to the Tupelo library.

and one more old school reduce-based solution:
(defn split-all
  "Splits items into a vector of vectors, starting a new group at every item
  for which pred returns logical true. Returns nil for empty input.

  A matching first item opens the first group directly; no empty leading
  group is produced."
  [pred items]
  (when (seq items)
    (let [[groups current]
          (reduce (fn [[acc curr] x]
                    ;; only close the running group when it has content,
                    ;; otherwise a match on the very first item would emit []
                    (if (and (pred x) (seq curr))
                      [(conj acc curr) [x]]
                      [acc (conj curr x)]))
                  [[] []]
                  items)]
      ;; flush the group that was still open when the input ran out
      (conj groups current))))
#'user/split-all
user> (split-all #(zero? (rem % 3)) '(1 2 3 6 7 8 10 11 12))
;;=> [[1 2] [3] [6 7 8 10 11] [12]]

Related

Clojure; select all nth element from list of lists with unequal size, for n = 1, 2,

I'd like to have a function, such that,
(f '([1 4 7] [2 5 9] [3 6]))
would give
([1 2 3] [4 5 6] [7 9])
I tried
(apply map vector '([1 4 7] [2 5 9] [3 6]))
would only produce:
([1 2 3] [4 5 6])
I find my requirement hard to describe, which makes it difficult to search for a ready-made solution.
Please help me either improve my description or point me to a solution.
Thanks in advance!
I'd solve a more general problem, which means you might reuse the function in the future. I'd change map so that it keeps going past the end of the shortest collection.
(defn map-all
  "Variant of map that does not stop at the shortest collection: with several
  collections, f is called at each position with one argument per collection
  that still has an element there."
  ([f] (map f))
  ([f coll] (map f coll))
  ([f c1 & colls]
   (letfn [(columns [cs]
             (lazy-seq
               ;; keep seq drops the exhausted inputs; testing emptiness with
               ;; seq (not first) keeps nil/false elements intact
               (when-let [live (seq (keep seq cs))]
                 (cons (map first live)
                       (columns (map rest live))))))]
     (map (fn [args] (apply f args))
          (columns (cons c1 colls))))))
(apply map-all vector '([1 4 7] [2 5 9] [3 6]))
(apply map-all vector '([1 false 7] [nil 5 9] [3 6] [8]))
Note, that as opposed to many other solutions, this one works fine even if any of the sequences contain nil or false.
or this way with loop/recur:
user> (defn transpose-all-2 [colls]
;; Transposes collections of unequal length: row k of the result holds the
;; k-th element of every input that has one. Returns a vector of vectors.
(loop [colls colls res []]
;; drop the exhausted inputs; the loop ends once none have elements left
(if-let [colls (seq (filter seq colls))]
;; doall realizes the tails on every pass instead of stacking lazy maps
(recur (doall (map next colls))
(conj res (mapv first colls)))
res)))
#'user/transpose-all-2
user> (transpose-all-2 x)
[[1 2 3] [4 5 6] [7 9]]
user> (transpose-all-2 '((0 1 2 3) (4 5 6 7) (8 9)))
[[0 4 8] [1 5 9] [2 6] [3 7]]
If you know the maximum length of the vectors ahead of time, you could define
(defn tx
  "Returns an endless lazy seq of columns: element k is a vector of the k-th
  items of colls, with nil/false entries filtered out. It never terminates on
  its own, so the caller must bound it (e.g. with take)."
  [colls]
  (lazy-seq
    (let [heads (map first colls)
          tails (map rest colls)]
      (cons (filterv identity heads)
            (tx tails)))))
then
(take 3 (tx '([1 4 7] [2 5 9] [3 6])))
A simple solution is
(defn transpose-all
  "Lazily transposes collections of unequal length. Each result element is a
  seq of the items found at that position in the inputs that reach it."
  [colls]
  (lazy-seq
    (when-let [live (seq (keep seq colls))]
      (let [column (map first live)
            remainder (map rest live)]
        (cons column (transpose-all remainder))))))
For example,
(transpose-all '([1 4 7] [2 5 9] [3 6] [11 12 13 14]))
;((1 2 3 11) (4 5 6 12) (7 9 13) (14))
Here is my own attempt:
(defn f
  "Transposes a list of collections of unequal length: element k of the result
  holds the k-th item of every input long enough to have one.

  Returns an empty seq for empty input. Shorter inputs are padded with a
  private sentinel (not nil), so genuine nil/false items are preserved."
  [l]
  (if (empty? l)
    ()
    (let [max-count (apply max (map count l))
          ;; a fresh object cannot collide with any caller-supplied value
          pad (Object.)
          pad? (fn [x] (identical? pad x))
          l-patched (map (fn [e] (concat e (repeat (- max-count (count e)) pad)))
                         l)]
      (map (fn [column] (remove pad? column))
           (apply map vector l-patched)))))
Another simple solution:
;; Pad every row with an endless tail of nils so map never runs short, zip the
;; rows into column vectors, and stop at the first column that holds no truthy
;; value. The nil padding is left in the columns that are kept.
(->> jagged-list
(map #(concat % (repeat nil)))
(apply map vector)
(take-while (partial some identity)))
A jagged-list like this
'([1 4 7 ]
[2 5 9 ]
[3 6 ]
[11 12 13 14])
will produce:
'([1 2 3 11]
[4 5 6 12]
[7 9 nil 13]
[nil nil nil 14])
Here is another go that doesn't require you to know the vector length in advance:
(defn padzip
  "Transposes collections of unequal length into a vector of vectors without
  needing their lengths up front. Only the first argument (a collection of
  collections) is used.

  Exhausted inputs are dropped by testing them with seq, so nil items that are
  really present in the data are kept."
  [& [colls]]
  (loop [acc []
         colls colls]
    (let [live (filter seq colls)]
      (if (empty? live)
        acc
        (recur (conj acc (mapv first live))
               (map rest live))))))

I would like to Parallelize my Clojure implementation

OK, so I have an algorithm: it loops through a file line by line and looks for a given word in each line. It returns not only the given word but also a given number (also passed as a parameter) of words that come before and after that word.
Eg.line = "I am overflowing with blessings and you also are"
parameters = ("you" 2)
output = (blessings and you also are)
;; Serial version: scan the file line by line and print every non-empty
;; result of looking up "good" with a context of 2 words.
(with-open [r (clojure.java.io/reader "resources/small.txt")]
  (doseq [l (line-seq r)
          :let [x (topMostLoop l "good" 2)]
          :when (seq x)]
    (println x)))
the above code is working fine. But i would like to parallelize it so i did this below
(with-open [r (clojure.java.io/reader "resources/small.txt")]
(doseq [l (line-seq r)]
;; one future per line: each body runs on another thread and nothing here
;; waits for the futures or orders them, so their println calls can interleave
(future
(let [x (topMostLoop l "good" 2)]
(if (not (empty? x))
(println x))))))
but then the output comes out all jumbled. I know I need to lock somewhere, but I don't know where.
(defn topMostLoop
  "Splits contents into words and, for every position at which word occurs,
  returns the words from `next` positions before it to `next` positions after
  it, clamped to the bounds of the line. Yields one seq per occurrence."
  [contents word next]
  (let [mywords (str/split contents #"[ ,\\.]+")
        last-index (dec (count mywords))
        ;; every index at which `word` occurs in this line
        hits (vec ((indexHashMap mywords) word))]
    (map (fn [element]
           (return-lines (max 0 (- element next))
                         (min (+ element next) last-index)
                         mywords))
         hits)))
I would be glad if someone could help me; this is the last thing I have left to do.
NB. Do let me know if i need to post the other functions as well
I have added the other functions for more clarity
(defn return-lines
  "Returns the items of contentArray from index firstItem through lastItem
  (inclusive), each converted to a string."
  [firstItem lastItem contentArray]
  (let [wanted (inc (- lastItem firstItem))]
    (->> contentArray
         (drop firstItem)
         vec
         (map str)
         (take wanted))))
(defn indexHashMap
  "Builds an index from each word in mywords to a vector of the positions at
  which it occurs, in ascending order.

  Positions are paired with words via map-indexed rather than read back out of
  a zipmap, because a hash map does not guarantee iteration order. Each
  position is conj'ed onto a vector instead of merged with concat, which
  avoids deeply nested lazy seqs for frequent words."
  [mywords]
  (reduce (fn [index [location word]]
            (update-in index [word] (fnil conj []) location))
          {}
          (map-indexed vector mywords)))
First, use map for first example when you are using serial approach:
;; Map every line to its lookup result first, then print the non-empty ones.
(with-open [r (clojure.java.io/reader "resources/small.txt")]
  (let [results (map (fn [line] (topMostLoop line "good" 2))
                     (line-seq r))]
    (doseq [l results]
      (when (seq l)
        (println l)))))
With this approach topMostLoop function is applied on each line, and lazy seq of results is returned. In body of doseq function results are printed if not empty.
After that, replace map with pmap, which will run mapping in parallel, and results will appear in same order as given lines:
;; Same pipeline with pmap: lookups run in parallel while the results are
;; still consumed, and therefore printed, in line order.
(with-open [r (clojure.java.io/reader "resources/small.txt")]
  (doseq [l (pmap (fn [line] (topMostLoop line "good" 2))
                  (line-seq r))
          :when (seq l)]
    (println l)))
In your case with futures, results will normally be out of order (some later futures will finish sooner than earlier ones).
I tested this with following modifications (not reading text file, but creating lazy sequence of vector of numbers, searching for value in vectors and returning surrounding):
(def lines (repeatedly #(shuffle (range 1 11))))
(def lines-10 (take 10 lines))
lines-10
([5 8 3 10 6 9 7 2 1 4]
[6 8 9 7 2 5 10 4 1 3]
[2 7 8 9 1 5 10 3 4 6]
[10 8 3 5 7 2 4 9 6 1]
[8 6 10 1 9 4 3 7 2 5]
[9 6 8 1 5 10 3 4 2 7]
[10 9 3 7 1 8 4 6 5 2]
[6 1 4 10 3 7 8 9 5 2]
[9 6 7 5 8 3 10 4 2 1]
[4 1 5 2 7 3 6 9 8 10])
(defn surrounding
  "Returns the sub-vector of v spanning `size` elements on either side of the
  first occurrence of value (clamped to the vector bounds), or nil when value
  is not present."
  [v value size]
  (let [i (.indexOf v value)]
    (when-not (= i -1)
      (let [lo (max 0 (- i size))
            hi (min (dec (count v)) (+ i size))]
        ;; subvec's end index is exclusive, hence the inc
        (subvec v lo (inc hi))))))
(doseq [l (map #(surrounding % 3 2) lines-10)] (if (not (empty? l)) (println l)))
[5 8 3 10 6]
[4 1 3]
[5 10 3 4 6]
[10 8 3 5 7]
[9 4 3 7 2]
[5 10 3 4 2]
[10 9 3 7 1]
[4 10 3 7 8]
[5 8 3 10 4]
[2 7 3 6 9]
nil
(doseq [l (pmap #(surrounding % 3 2) lines-10)] (if (not (empty? l)) (println l)))
[5 8 3 10 6]
[4 1 3]
[5 10 3 4 6]
[10 8 3 5 7]
[9 4 3 7 2]
[5 10 3 4 2]
[10 9 3 7 1]
[4 10 3 7 8]
[5 8 3 10 4]
[2 7 3 6 9]
nil

clojure: partition a seq based on a seq of values

I would like to partition a seq, based on a seq of values
(partition-by-seq [3 5] [1 2 3 4 5 6])
((1 2 3)(4 5)(6))
The first input is a seq of split points.
The second input is a seq i would like to partition.
So, that the first list will be partitioned at the value 3 (1 2 3) and the second partition will be (4 5) where 5 is the next split point.
another example:
(partition-by-seq [3] [2 3 4 5])
result: ((2 3)(4 5))
(partition-by-seq [2 5] [2 3 5 6])
result: ((2)(3 5)(6))
given: the first seq (split points) is always a subset of the second input seq.
I came up with this solution which is lazy and quite (IMO) straightforward.
(defn part-seq
  "Lazily partitions coll at the values in splitters: each partition runs up
  to and including the current split point. Once the splitters are used up,
  whatever is left of coll forms the final partition."
  [splitters coll]
  (lazy-seq
    (when-let [s (seq coll)]
      (let [split-point (first splitters)]
        (if-not split-point
          ;; no splitter left: the remainder is a single partition
          (list coll)
          (let [within? (fn [x] (<= x split-point))
                ;; the head is always taken, then everything up to the splitter
                chunk (cons (first s) (take-while within? (next s)))
                remainder (drop (count chunk) s)]
            ;; recurse lazily with the next splitter
            (cons chunk (part-seq (rest splitters) remainder))))))))
If the split points are all in the sequence:
(part-seq [3 5 8] [0 1 2 3 4 5 6 7 8 9])
;;=> ((0 1 2 3) (4 5) (6 7 8) (9))
If some split points are not in the sequence
(part-seq [3 5 8] [0 1 2 4 5 6 8 9])
;;=> ((0 1 2) (4 5) (6 8) (9))
Example with some infinite sequences for the splitters and the sequence to split.
(take 5 (part-seq (iterate (partial + 3) 5) (range)))
;;=> ((0 1 2 3 4 5) (6 7 8) (9 10 11) (12 13 14) (15 16 17))
the sequence to be partitioned is a splittee and the elements of split-points (aka. splitter) marks the last element of a partition.
from your example:
splittee: [1 2 3 4 5 6]
splitter: [3 5]
result: ((1 2 3)(4 5)(6))
Because each resulting partition is always an increasing integer sequence, and an increasing integer sequence of x can be defined as start <= x < end, each splitter element can be transformed into the (exclusive) end of a sequence according to that definition.
so, from [3 5], we want to find subsequences ended with 4 and 6.
then by adding the start, the splitter can be transformed into sequences of [start end]. The start and end of the splittee is also used.
so, the splitter [3 5] then becomes:
[[1 4] [4 6] [6 7]]
splitter transformation could be done like this
;; Turn the splitter values into [start end) pairs: the first splittee value
;; opens the first range, each splitter value closes one range and opens the
;; next at (inc value), and (inc last) closes the final range.
(->> (concat [(first splittee)]
             (mapcat (juxt inc inc) splitter)
             [(inc (last splittee))])
     (partition 2))
there is a nice symmetry between transformed splitter and the desired result.
[[1 4] [4 6] [6 7]]
((1 2 3) (4 5) (6))
then the problem becomes how to extract subsequences inside splittee that is ranged by [start end] inside transformed splitter
clojure has subseq function that can be used to find a subsequence inside ordered sequence by start and end criteria. I can just map the subseq of splittee for each elements of transformed-splitter
;; Build the sorted set once; every [start end) pair then reads its own slice.
(let [sorted (apply sorted-set splittee)]
  (map (fn [[x y]]
         (subseq sorted <= x < y))
       transformed-splitter))
by combining the steps above, my answer is:
(defn partition-by-seq
  "Partitions splittee so that each value in splitter is the last element of
  its partition. The splitter is turned into [start end) pairs and each pair
  is looked up as a sub-range of a sorted set of the splittee.

  Returns an empty seq for an empty splittee. The sorted set is built once
  rather than once per partition."
  [splitter splittee]
  (if (empty? splittee)
    ()
    (let [sorted (apply sorted-set splittee)]
      (->> (concat [(first splittee)]
                   (mapcat (juxt inc inc) splitter)
                   [(inc (last splittee))])
           (partition 2)
           (map (fn [[x y]]
                  (subseq sorted <= x < y)))))))
This is the solution i came up with.
(def a [1 2 3 4 5 6])
(def p [2 4 5])
(defn partition-by-seq
  "Partitions input at the split points in s: each partition holds the leading
  values that are <= the current split point. What remains after the last
  split point becomes the final partition; empty partitions are dropped."
  [s input]
  (loop [splits (seq s)
         t input
         v (transient [])]
    (if splits
      (let [[chunk remaining] (split-with #(<= % (first splits)) t)]
        (recur (next splits) remaining (conj! v chunk)))
      ;; conj! must be used for its return value; the transient passed in is
      ;; not guaranteed to be the one holding the new element
      (remove empty? (persistent! (conj! v t))))))
(partition-by-seq p a)

Remove n instances of matched elements from collection

What is the best way to remove n instances of matched elements of collection-2 from collection-1?
(let [coll-1 [8 2]
coll-2 [8 8 8 2]
Here's what I first came up with to solve original problem:
...
;; (remove (set coll-1) coll-2))
;; --> ()
But realised I must achieve:
...
;; (some-magic coll-1 coll-2))
;; --> (8 8)
Clarification:
(some-magic {8 2} [8 8 8 2]) ;;Removes 1x8 and 1x2 from vector.
(some-magic {8 8 2} [8 8 8 2]) ;;Removes 2x8 and 1x2 from vector.
Edit:
Preserving the order is desired.
Here is a lazy solution, written in the style of distinct:
(defn some-magic
  "Lazily returns coll with occurrences removed according to count-map, a map
  from value to the number of leading occurrences of that value to drop.
  The order of the surviving elements is preserved."
  [count-map coll]
  (letfn [(survivors [items remaining]
            (lazy-seq
              ;; skip dropped items in a loop so that a long run of removals
              ;; does not nest one lazy-seq per removed element
              (loop [items items
                     remaining remaining]
                (when-let [s (seq items)]
                  (let [head (first s)]
                    (if (pos? (get remaining head 0))
                      (recur (rest s) (update-in remaining [head] dec))
                      (cons head (survivors (rest s) remaining))))))))]
    (survivors coll count-map)))
The first argument needs to be a map indicating how many of each value to remove:
(some-magic {8 1, 2 1} [8 8 8 2]) ;; Removes 1x8 and 1x2
;=> (8 8)
(some-magic {8 2, 2 1} [8 8 8 2]) ;; Removes 2x8 and 1x2
;=> (8)
Here is an example dealing with falsey values and infinite input:
(take 10 (some-magic {3 4, 2 2, nil 1} (concat [3 nil 3 false nil 3 2] (range))))
;=> (false nil 0 1 4 5 6 7 8 9)
I don't see any of the built-in sequence manipulation functions quite solving this, though a straightforward loop can build the result nicely:
;; Remove from coll-2 as many occurrences of each value as coll-1 contains,
;; keeping the survivors in their original order. Counting with frequencies
;; (instead of set membership) handles repeated and falsey values, and walking
;; a seq makes an empty coll-2 yield [].
(loop [pending (frequencies coll-1)
       remaining (seq coll-2)
       result []]
  (if remaining
    (let [f (first remaining)]
      (if (pos? (get pending f 0))
        (recur (update-in pending [f] dec) (next remaining) result)
        (recur pending (next remaining) (conj result f))))
    result))
[8 8]

Changing map behaviour in Clojure

I need to modify map function behavior to provide mapping not with minimum collection size but with maximum and use zero for missing elements.
Standard behavior:
(map + [1 2 3] [4 5 6 7 8]) => [5 7 9]
Needed behavior:
(map + [1 2 3] [4 5 6 7 8]) => [5 7 9 7 8]
I wrote function to do this, but it seems not very extensible with varargs.
(defn map-ext
  "Maps f over coll1 and coll2 up to the length of the longer one, supplying 0
  for the positions the shorter one lacks."
  [f coll1 coll2]
  (let [n1 (count coll1)
        n2 (count coll2)
        mx (max n1 n2)
        ;; extend a collection of length n with zeros up to mx elements
        zero-pad (fn [coll n] (concat coll (repeat (- mx n) 0)))]
    (map f
         (zero-pad coll1 n1)
         (zero-pad coll2 n2))))
Is there a better way to do this?
Your method is concise, but inefficient (it calls count). A more efficient solution, which does not require the entirety of its input sequences to be stored in memory follows:
(defn map-pad
  "Lazy variant of map that runs to the end of the longest collection, passing
  pad to f in place of elements that exhausted collections can no longer
  supply. Does not call count, so it works with lazy and infinite inputs.

  A collection is treated as exhausted only when its seq is nil, so elements
  that are themselves nil or false are passed through rather than replaced."
  [f pad & colls]
  (lazy-seq
    (let [seqs (map seq colls)]
      (when (some identity seqs)
        (cons (apply f (map (fn [s] (if s (first s) pad)) seqs))
              (apply map-pad f pad (map rest seqs)))))))
Used like this:
user=> (map-pad + 0 [] [1] [1 1] (range 1 10))
(3 3 3 4 5 6 7 8 9)
Edit: Generalized map-pad to arbitrary arity.
Another lazy variant, usable with an arbitrary number of input sequences:
(defn map-ext
  "Lazily maps f across seqs until all of them are exhausted, passing ext for
  every sequence that has already run out."
  [f ext & seqs]
  (lazy-seq
    (when (some seq seqs)
      (let [heads (for [s seqs]
                    (if (seq s) (first s) ext))
            tails (map rest seqs)]
        (cons (apply f heads)
              (apply map-ext f ext tails))))))
Usage:
user> (map-ext + 0 [1 2 3] [4 5 6 7 8])
(5 7 9 7 8)
user> (map-ext + 0 [1 2 3] [4 5 6 7 8] [3 4])
(8 11 9 7 8)
If you just want it to work for any number of collections, try:
(defn map-ext
  "Maps f across any number of collections up to the length of the longest,
  padding the shorter ones with 0. Not lazy in its inputs: every collection is
  counted."
  [f & colls]
  (let [sizes (map count colls)
        mx (apply max sizes)
        padded (map (fn [coll size]
                      (concat coll (repeat (- mx size) 0)))
                    colls
                    sizes)]
    (apply map f padded)))
Clojure> (map-ext + [1 2] [1 2 3] [1 2 3 4])
(3 6 6 4)
I suspect there may be better solutions though (as Trevor Caira suggests, this solution isn't lazy due to the calls to count).
How about that:
(defn map-ext
  "Maps f across the collections as far as the shortest one reaches, then
  repeats on whatever is left of the longer ones, so f is called with fewer
  arguments as collections run out."
  [f x & xs]
  (let [colls (cons x xs)
        res (apply map f colls)
        consumed (count res)
        ;; tails of the collections that outlast this round
        remaining (->> colls
                       (map (fn [coll] (drop consumed coll)))
                       (filter not-empty))]
    (if (seq remaining)
      (lazy-seq (concat res (apply map-ext f remaining)))
      res)))
user> (map-ext + [1 2 3] [4] [5 6] [7 8 9 10])
(17 16 12 10)
Along the lines of @LeNsTR's solution, but simpler and faster:
(defn map-ext
  "Lazily maps f across the collections, dropping each collection from the
  argument list once it is exhausted."
  [f & colls]
  (lazy-seq
    (when-let [live (seq (filter seq colls))]
      (cons (apply f (map first live))
            (apply map-ext f (map rest live))))))
(map-ext + [1 2 3] [4] [5 6] [7 8 9 10])
;(17 16 12 10)
I've just noticed Michał Marczyk's accepted solution, which is superior: it deals properly with asymmetric mapping functions such as -.
We can make Michał Marczyk's answer neater by using the convention - which many core functions follow - that you get a default or identity value by calling the function with no arguments. For example:
(+) ;=> 0
(concat) ;=> ()
The code becomes
(defn map-ext
  "Lazily maps f across seqs until all are exhausted. For a sequence that has
  run out, the value of (f), i.e. f called with no arguments, stands in for
  the missing element."
  [f & seqs]
  (lazy-seq
    (when (some seq seqs)
      (let [head-or-default (fn [s]
                              (if-let [items (seq s)]
                                (first items)
                                (f)))]
        (cons (apply f (map head-or-default seqs))
              (apply map-ext f (map rest seqs)))))))
(map-ext + [1 2 3] [4 5 6 7 8] [3 4])
;(8 11 9 7 8)
I've made the minimum changes. It could be speeded up a bit.
We may need a function that will inject such a default value into a function that lacks it:
(defn with-default
  "Wraps f so that calling the result with no arguments returns default, while
  any other call is passed straight through to f."
  [f default]
  (fn [& args]
    (if (seq args)
      (apply f args)
      default)))
((with-default + 6)) ;=> 6
((with-default + 6) 7 8) ;=> 15
This could be speeded up or even turned into a macro.