Remove duplicate values in child nodes of XML file using XSLT - xslt

I have the XML data below and I want to remove the duplicate values from it.
<Report_Data>
<Report_Entry>
<Classifications_group>
<ClassificationGroupName Descriptor="EEO-1 Job Categories">
</ClassificationGroupName>
<ClassificationDescription>Professionals</ClassificationDescription>
</Classifications_group>
<Classifications_group>
<ClassificationGroupName Descriptor="Hartford Job Category">
</ClassificationGroupName>
<ClassificationDescription>Other</ClassificationDescription>
</Classifications_group>
<Classifications_group>
<ClassificationGroupName Descriptor="Hartford Job Category">
</ClassificationGroupName>
<ClassificationDescription>Other</ClassificationDescription>
</Classifications_group>
</Report_Entry>
<Report_Entry>
<Classifications_group>
<ClassificationGroupName Descriptor="EEO-1 Job Categories">
</ClassificationGroupName>
<ClassificationDescription>Administrative Support Workers</ClassificationDescription>
</Classifications_group>
<Classifications_group>
<ClassificationGroupName Descriptor="Hartford Job Category">
</ClassificationGroupName>
<ClassificationDescription>Other</ClassificationDescription>
</Classifications_group>
</Report_Entry>
</Report_Data>
I have used the following XSLT to remove duplicate values.
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Writes a two-column CSV (ClassificationGroupName, ClassificationDescription)
  from Report_Data/Report_Entry/Classifications_group.
  Duplicates are removed only inside each Report_Entry, because the grouping
  runs within the loop over Report_Entry; a pair that occurs in two different
  entries is still written once per entry.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:xs="http://www.w3.org/2001/XMLSchema"
                exclude-result-prefixes="xs" version="2.0">

  <!-- Line terminator. Written as a character reference because a literal
       line break inside an attribute value is normalized to a single space
       by the XML parser.
       NOTE(review): the variable name suggests CR+LF; switch to the
       references for #xD followed by #xA if the consumer needs Windows
       line endings. -->
  <xsl:variable name="CRLF" select="'&#xA;'"/>

  <xsl:output indent="no" method="text"/>
  <xsl:strip-space elements="*"/>

  <xsl:template match="Report_Data">
    <!-- Header row. -->
    <xsl:value-of select="'ClassificationGroupName,ClassificationDescription'"/>
    <xsl:value-of select="$CRLF"/>
    <xsl:for-each select="Report_Entry">
      <!-- Grouping key: the Descriptor attribute and the description text,
           joined with '|'. The instructions below run once per group, with
           the first Classifications_group of the group as context. -->
      <xsl:for-each-group select="Classifications_group"
                          group-by="concat(ClassificationGroupName/@Descriptor, '|', ClassificationDescription)">
        <xsl:value-of select="ClassificationGroupName/@Descriptor"/>
        <xsl:value-of select="','"/>
        <xsl:value-of select="ClassificationDescription"/>
        <xsl:value-of select="$CRLF"/>
      </xsl:for-each-group>
    </xsl:for-each>
  </xsl:template>
</xsl:stylesheet>
Output:
ClassificationGroupName,ClassificationDescription
EEO-1 Job Categories,Professionals
Hartford Job Category,Other
EEO-1 Job Categories,Administrative Support Workers
Hartford Job Category,Other
Expected output:
ClassificationGroupName,ClassificationDescription
EEO-1 Job Categories,Professionals
Hartford Job Category,Other
EEO-1 Job Categories,Administrative Support Workers
With the code that I have written, duplicates are removed only within each Report_Entry. I also want to remove any Classifications_group whose ClassificationGroupName and ClassificationDescription both duplicate a Classifications_group in any other Report_Entry.
What changes do I need to make to get the expected output?

I think you want to group all entries e.g. in XSLT 3 with a composite key
<!-- XSLT 3.0: groups every Classifications_group in the document (across all
     Report_Entry elements) on a composite key made of the Descriptor attribute
     and the description, and writes one line per distinct key. -->
<xsl:template match="Report_Data">
  <xsl:for-each-group select="Report_Entry/Classifications_group" composite="yes"
                      group-by="ClassificationGroupName/@Descriptor, ClassificationDescription">
    <!-- With composite="yes" the grouping key is a sequence; its items are
         joined with ", ". -->
    <xsl:value-of select="current-grouping-key()" separator=", "/>
    <xsl:text>&#xA;</xsl:text>
  </xsl:for-each-group>
</xsl:template>
or, with your XSLT 2 approach, using
<!-- XSLT 2.0: writes the CSV header, then one row per distinct
     (Descriptor, description) pair. The grouping population is every
     Classifications_group under any Report_Entry, so duplicates are removed
     across entries. Relies on a $CRLF variable declared elsewhere in the
     stylesheet. -->
<xsl:template match="Report_Data">
  <xsl:value-of select="'ClassificationGroupName,ClassificationDescription'"/>
  <xsl:value-of select="$CRLF"/>
  <!-- Grouping key: Descriptor attribute and description joined with '|'. -->
  <xsl:for-each-group select="Report_Entry/Classifications_group"
                      group-by="concat(ClassificationGroupName/@Descriptor, '|', ClassificationDescription)">
    <xsl:value-of select="ClassificationGroupName/@Descriptor"/>
    <xsl:value-of select="','"/>
    <xsl:value-of select="ClassificationDescription"/>
    <xsl:value-of select="$CRLF"/>
  </xsl:for-each-group>
</xsl:template>

Related

How to remove duplicate entry - XSLT

I am trying to remove duplicate entries after the § entity. In addition, if an entry contains a comma and, after tokenizing, a token starts with a round bracket "(", the leading part of the first token should be carried over to it: e.g. 17200(b)(2), (4)–(6) should become <p>17200(b)(2)</p><p>17200(b)(4)–(6)</p>.
Input XML
<root>
<p>CC §1(a), (b), (c)</p>
<p>Civil Code §1(a), (b)</p>
<p>CC §§2(a)</p>
<p>Civil Code §3(a)</p>
<p>CC §1(c)</p>
<p>Civil Code §1(a), (b), (c)</p>
<p>Civil Code §17200(b)(2), (4)–(6), (8), (12), (16), (20), and (21)</p>
</root>
Expected Output
<root>
<sec specific-use="CC">
<title content-type="Sta_Head3">CIVIL CODE</title>
<p>1(a)</p>
<p>1(b)</p>
<p>1(c)</p>
<p>2(a)</p>
<p>3(a)</p>
<p>17200(b)(2)</p>
<p>17200(b)(4)–(6)</p>
<p>17200(b)(8)</p>
<p>17200(b)(12)</p>
<p>17200(b)(16)</p>
<p>17200(b)(20)</p>
<p>17200(b)(21)</p>
</sec>
</root>
XSLT Code
<!-- Copies root and builds one sec element per code abbreviation, listing the
     section references found in the matching p elements. -->
<xsl:template match="root">
<xsl:copy>
<!-- Outer grouping: p elements starting with 'CC ' or 'Civil Code', keyed on
     the text before ' §' with 'Civil Code' replaced by 'CC'. -->
<xsl:for-each-group select="p[(starts-with(., 'CC ') or starts-with(., 'Civil Code'))]" group-by="replace(substring-before(., ' §'), 'Civil Code', 'CC')">
<xsl:text>
</xsl:text>
<sec specific-use="{current-grouping-key()}">
<xsl:text>
</xsl:text>
<title content-type="Sta_Head3">CIVIL CODE</title>
<!-- Inner grouping: keyed on the text after the first '§', with any further
     '§' removed. Groups are sorted numerically on the key with everything
     from the first character that is neither a digit nor '.' stripped off. -->
<xsl:for-each-group select="current-group()" group-by="replace(substring-after(., '§'), '§', '')">
<xsl:sort select="replace(current-grouping-key(), '[^0-9.].*$', '')" data-type="number" order="ascending"/>
<!-- Splits the key on ', and ', ', ' or ' and '. The first token is kept as
     is; every later token is prefixed with the part of the first token that
     precedes its first '('. For '17200(b)(2), (4)–(6)' that prefix is '17200',
     so the second item becomes '17200(4)–(6)', not '17200(b)(4)–(6)'.
     distinct-values() is applied per inner group only, so an item produced by
     two different keys (e.g. '1(a)') is still emitted once for each key. -->
<xsl:for-each
select="distinct-values(
current-grouping-key() !
(let $tokens := tokenize(current-grouping-key(), ', and |, | and ')
return (head($tokens), tail($tokens) ! (substring-before(head($tokens), '(') || .)))
)" expand-text="yes">
<p>{.}</p>
</xsl:for-each>
</xsl:for-each-group>
</sec>
</xsl:for-each-group>
</xsl:copy>
</xsl:template>
You could do it like this, in a two-step approach where you first compute the list of existing elements and then use a for-each-group to remove duplicates.
<?xml version="1.0" encoding="UTF-8"?>
<!-- Two-step approach: the p template expands every input p into one p per
     section reference; the root template collects those into a temporary
     tree and then emits one p per distinct value. -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
exclude-result-prefixes="#all"
version="3.0">
<xsl:output method="xml" indent="yes"/>
<xsl:template match="/">
<!-- Step 1: temporary tree holding all expanded p elements. -->
<xsl:variable name="listP">
<xsl:apply-templates select="root/p"/>
</xsl:variable>
<!-- Step 2: the population is the single $listP tree and the grouping keys
     are the values of its p children, so the body runs once per distinct
     p value. The resulting p elements are written without a wrapper
     element. -->
<xsl:for-each-group select="$listP" group-by="p">
<p><xsl:value-of select="current-grouping-key()"/></p>
</xsl:for-each-group>
</xsl:template>
<xsl:template match="p">
<!-- Text after the first '§', with any further '§' removed. -->
<xsl:variable name="input" select="replace(substring-after(.,'§'),'§','')"/>
<!-- Everything before the first '(' of that text. -->
<xsl:variable name="chapter" select="substring-before($input,'(')"/>
<!-- The remainder is split on ','; spaces and the text 'and' are removed
     from each token, and $chapter is prepended. -->
<xsl:for-each select="tokenize(substring-after($input, $chapter),',')">
<p><xsl:value-of select="concat($chapter,replace(replace(.,' ',''),'and',''))"/></p>
</xsl:for-each>
</xsl:template>
</xsl:stylesheet>
See it working here : https://xsltfiddle.liberty-development.net/gVrvcxQ

XSLT sort across nodes and text

I'm trying to find, sort, and output a string of copyright years. I've got a working bit of code, but I just found that some of my years are not in the same tags as others.
Initially I thought all my years were in the following tag: <copyright-year>2020</copyright-year>, see below for a working bit of code to find, sort, and output those.
I just found that some of my copyright years look like this: <copyright-statement>© 2017 Company. All rights reserved.</copyright-statement>.
I can find the years in these statements using //copyright-statement/substring(.,3,4). However, when I tried to search for both types like this: <xsl:for-each-group select="//copyright-year|copyright-statement/substring(., 3, 4)" group-by="text()">, it gives the following warning:
Required item type of document-order sorter is node(); supplied expression ((./copyright-statement)/(fn:substring(...))) has item type xs:string. The expression can succeed only if the supplied value is an empty sequence.
And obviously doesn't work. Any idea how to merge these two sets of years to get: <output>2020, 2019, 2017</output>?
Sample XML
<?xml version="1.0" encoding="UTF-8"?>
<book>
<book-meta>
<copyright-year>2020</copyright-year>
</book-meta>
<body>
<book-part>
<book-part-meta>
<copyright-year>2019</copyright-year>
</book-part-meta>
</book-part>
</body>
<back>
<copyright-statement>© 2017 Company. All rights reserved.</copyright-statement>
</back>
</book>
Sample XSLT
<?xml version="1.0" encoding="UTF-8"?>
<!-- Collects the distinct copyright-year values of the document and writes
     them, in descending order and separated by ', ', into an output element.
     Only copyright-year elements are selected; years that appear inside
     copyright-statement text are not picked up. -->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
exclude-result-prefixes="xs"
version="2.0">
<xsl:template match="book">
<xsl:variable name="years">
<!-- One group per distinct text value; no data-type is given on the sort. -->
<xsl:for-each-group select="//copyright-year" group-by="text()">
<xsl:sort select="." order="descending"/>
<!-- ', ' is written after every group except the last one. -->
<xsl:value-of select="."/><xsl:if test="position() != last()"><xsl:text>, </xsl:text></xsl:if>
</xsl:for-each-group>
</xsl:variable>
<output><xsl:value-of select="$years"/></output>
</xsl:template>
</xsl:stylesheet>
Which version of which XSLT processor do you use? XSLT 3 has a sort function
<!-- XSLT 3.0: converts every copyright-year, and characters 3 to 6 of every copyright-statement, to xs:integer, removes duplicates, sorts, reverses the order, and joins the values with ', '. -->
<xsl:value-of select="reverse(sort(distinct-values((//copyright-year/xs:integer(.), //copyright-statement/xs:integer(substring(.,3,4))))))" separator=", "/>
https://xsltfiddle.liberty-development.net/bwdwsd
It might be easier to read that with the new => arrow operator:
<!-- Same pipeline written with the => arrow operator: integers from both sources, then distinct-values, sort and reverse, joined with ', '. -->
<xsl:value-of
select="(//copyright-year/xs:integer(.), //copyright-statement/xs:integer(substring(.,3,4)))
=> distinct-values()
=> sort()
=> reverse()"
separator=", "/>
https://xsltfiddle.liberty-development.net/bwdwsd/2
But in general the step you need is to simply ensure you work with atomic values e.g. xs:integers seems the right value for years. I think in XSLT 2 I would wrap perform-sort into a function:
<!-- mf:sort($items): returns the supplied items sorted in descending order of
     their own value. The mf prefix must be bound to a namespace on the
     enclosing stylesheet. -->
<xsl:function name="mf:sort" as="item()*">
  <xsl:param name="items" as="item()*"/>
  <xsl:perform-sort>
    <xsl:sort select="." order="descending"/>
    <xsl:sequence select="$items"/>
  </xsl:perform-sort>
</xsl:function>
<!-- Writes the distinct copyright years of the whole document, highest first,
     separated by ', '. Years come from copyright-year elements and from
     characters 3 to 6 of each copyright-statement. -->
<xsl:template match="book">
  <xsl:variable name="years" as="xs:integer*"
                select="(//copyright-year/xs:integer(.),
                         //copyright-statement/xs:integer(substring(., 3, 4)))"/>
  <xsl:value-of select="mf:sort(distinct-values($years))" separator=", "/>
</xsl:template>
https://xsltfiddle.liberty-development.net/bwdwsd/1
Here's one way to get the specified output:
XSLT 2.0
<!-- Collects the years from copyright-year and copyright-statement elements,
     sorts them numerically in descending order and writes them,
     comma-separated, into an output element. -->
<xsl:stylesheet version="2.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
exclude-result-prefixes="xs">
<xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
<xsl:template match="/book">
<xsl:variable name="years" as="xs:string*">
<!-- The sequence being sorted is whatever the applied templates return.
     No template in this stylesheet matches copyright-year, so the built-in
     template rules handle those elements. -->
<xsl:perform-sort>
<xsl:sort select="." data-type="number" order="descending"/>
<xsl:apply-templates select="//(copyright-year | copyright-statement)"/>
</xsl:perform-sort>
</xsl:variable>
<output>
<!-- The separator is ',' with no following space. -->
<xsl:value-of select="$years" separator=","/>
</output>
</xsl:template>
<!-- Returns the text between '© ' and the next space of the statement. -->
<xsl:template match="copyright-statement">
<xsl:value-of select="substring-before(substring-after(., '© '), ' ')"/>
</xsl:template>
</xsl:stylesheet>
Demo: https://xsltfiddle.liberty-development.net/bwdwsd/3

How to Identify specific group of employees in large set of employees and assign them Y/N flag

I am looking to match certain group of employees from input file to pass YES or NO flag. It's based on employee id
Below are my source XML and the XSLT I am trying. In the source below, employees 12121 and 12123 need to be marked YES in the Approver column; all others should be NO. For this requirement I am trying to use the compare function, but it is not giving me an accurate result. Is there another way, or another function I can use?
<?xml version='1.0' encoding='UTF-8'?>
<Report_Data>
<Report_Entry>
<Employee_ID>12121</Employee_ID>
<Tax_State_Code>NY</Tax_State_Code>
</Report_Entry>
<Report_Entry>
<Employee_ID>12122</Employee_ID>
<Tax_State_Code>PA</Tax_State_Code>
</Report_Entry>
<Report_Entry>
<Employee_ID>12123</Employee_ID>
<Tax_State_Code>PA</Tax_State_Code>
</Report_Entry>
<Report_Entry>
<Employee_ID>12124</Employee_ID>
<Tax_State_Code>PA</Tax_State_Code>
</Report_Entry>
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="xs" version="2.0">
<xsl:variable name="comma">
<xsl:text>,</xsl:text>
</xsl:variable>
<xsl:variable name="nextline">
<xsl:text>
</xsl:text>
</xsl:variable>
<xsl:output method="text" omit-xml-declaration="yes"/>
<!-- Writes a header row, then one EmployeeID,Approver,TaxState row per
     Report_Entry. Uses the $comma and $nextline variables declared earlier
     in the stylesheet. -->
<xsl:template match="/Report_Data">
<xsl:text>EmployeeID,Approver,TaxState</xsl:text>
<xsl:value-of select="$nextline"/>
<xsl:for-each select="Report_Entry">
<!-- $listA holds one piece of text, "12121","12123" (quotes and comma
     included); it is not a sequence of two IDs. -->
<xsl:variable name="listA">
<xsl:text>"12121","12123"</xsl:text>
</xsl:variable>
<xsl:variable name="empid" select="Employee_ID"/>
<xsl:value-of select="$empid"/>
<xsl:value-of select="$comma"/>
<xsl:choose>
<!-- compare() returns an integer (-1, 0 or 1), and any non-zero integer is
     true in a test. The employee ID never equals the whole $listA text, so
     this branch is always taken and every row gets YES. -->
<xsl:when test="compare($empid, $listA)">
<xsl:text>YES</xsl:text>
</xsl:when>
<xsl:otherwise>NO</xsl:otherwise>
</xsl:choose>
<xsl:value-of select="$comma"/>
<xsl:value-of select="Tax_State_Code"/>
<xsl:value-of select="$nextline"/>
</xsl:for-each>
</xsl:template>
Expected Output
EmployeeID,Approver,TaxState
12121,YES,NY
12122,NO,PA
12123,YES,PA
12124,NO,PA
Actual output currently getting is
EmployeeID,Approver,TaxState
12121,YES,NY
12122,YES,PA
12123,YES,PA
12124,YES,PA
This stylesheet:
<!-- Writes a header line followed by one EmployeeID,Approver,TaxState line
     per Report_Entry. Approver is YES when Employee_ID equals 12121 or 12123
     and NO otherwise. -->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="text" omit-xml-declaration="yes"/>
  <xsl:template match="/Report_Data">
    <!-- The general comparison (=) is true when Employee_ID equals any item
         of the sequence on the right.
         The separator is written as a character reference because a literal
         line break inside an attribute value is normalized to a space by the
         XML parser, which would put all rows on one line. -->
    <xsl:value-of
        select="'EmployeeID,Approver,TaxState',
                Report_Entry
                  /string-join(
                     (Employee_ID,
                      if (Employee_ID = (12121, 12123))
                      then 'YES'
                      else 'NO',
                      Tax_State_Code),
                     ','
                   )"
        separator="&#xA;"/>
  </xsl:template>
</xsl:stylesheet>
Output:
EmployeeID,Approver,TaxState
12121,YES,NY
12122,NO,PA
12123,YES,PA
12124,NO,PA
Do note: the compare function has a signature with two string arguments (plus a three-argument variant that takes a collation), and you are indeed comparing the Employee_ID element string value with the $listA text node string value '"12121","12123"'. For a one-to-many comparison you need to use the = operator.

Sort the info based on given sequence order

Please suggest how to sort the info, based on given sort sequence (Seq.xml).
The input is fetched from a folder whose file names follow the 'Sort##.xml' pattern. The output should be sorted by DOI number, following the sequence given in the external file 'D:\Sort\Seq.xml'.
Input XMLs:
D:\Sort\Sort01.xml
<article>
<fm>
<title>The solar system</title>
<aug><au>Rudramuni TP</au></aug>
<doi>10.11/MPS.0.10.11</doi>
</fm>
<body><text>The text</text></body>
</article>
D:\Sort\Sort02.xml
<article>
<fm>
<title>The Galaxy</title>
<aug><au>Kishan TR</au></aug>
<doi>10.11/MPS.0.10.2</doi>
</fm>
<body><text>The text</text></body>
</article>
D:\Sort\Sort03.xml
<article>
<fm>
<title>The Pluto</title>
<aug><au>Kowshik MD</au></aug>
<doi>10.11/MPS.0.10.10</doi>
</fm>
<body><text>The text</text></body>
</article>
Sequence info in D:\Sort\Seq.xml
<root>
<toc>
<seq seq="1"><art-id>10.11/MPS.0.10.2</art-id></seq>
<seq seq="2"><art-id>10.11/MPS.0.10.11</art-id></seq>
<seq seq="3"><art-id>10.11/MPS.0.10.10</art-id></seq>
</toc>
</root>
XSLT:
<!-- Lists title, author and DOI of every article found in the Sort##.xml
     files. No sort is applied: the xsl:sort below is commented out. -->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output omit-xml-declaration="yes"/>
<xsl:variable name="varCollection">
<xsl:copy-of select="
collection('file:///D:/Sort/?select=Sort*.xml; recurse=yes')
[matches(document-uri(.),'Sort/Sort[0-9][0-9].xml')]"/>
</xsl:variable><!-- Copies of the documents from the folder whose URI matches 'Sort/Sort##.xml' -->
<xsl:variable name="docSeq" select="document('D:/Sort/Seq.xml')"/><!-- This file holds the sequence information -->
<!--xsl:key name="kSeq" match="$docSeq/root/toc/seq/@seq" use="art-id"/--><!-- I tried with a key, but was unable to get the required sequence -->
<xsl:template match="root">
<xsl:for-each select="$varCollection/article">
<!--xsl:sort select="key('kSeq', fm/doi)"/-->
<art>
<title><xsl:value-of select="fm/title"/></title>
<Name><xsl:value-of select="fm/aug/au"/></Name>
<DOI><xsl:value-of select="fm/doi"/></DOI>
</art><xsl:text>
</xsl:text>
</xsl:for-each>
</xsl:template>
</xsl:stylesheet>
Required Sequence
<art><title>The Galaxy</title><Name>Kishan TR</Name><DOI>10.11/MPS.0.10.2</DOI></art>
<art><title>The solar system</title><Name>Rudramuni TP</Name><DOI>10.11/MPS.0.10.11</DOI></art>
<art><title>The Pluto</title><Name>Kowshik MD</Name><DOI>10.11/MPS.0.10.10</DOI></art>
I think you want
<!-- Lists title, author and DOI of every article in the Sort##.xml files,
     ordered by the seq attribute that Seq.xml assigns to the article's DOI. -->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:xs="http://www.w3.org/2001/XMLSchema" exclude-result-prefixes="xs">
  <xsl:output omit-xml-declaration="yes" indent="yes"/>

  <!-- Documents from the folder whose URI matches 'Sort/Sort##.xml'. -->
  <xsl:variable name="varCollection" select="
      collection('file:///D:/Sort/?select=Sort*.xml; recurse=yes')
      [matches(document-uri(.),'Sort/Sort[0-9][0-9].xml')]"/>

  <!-- This file holds the sequence information. -->
  <xsl:variable name="docSeq" select="document('file:///D:/Sort/Seq.xml')"/>

  <!-- Finds a seq element by the value of its art-id child. -->
  <xsl:key name="kSeq" match="root/toc/seq" use="art-id"/>

  <xsl:template match="root">
    <xsl:for-each select="$varCollection/article">
      <!-- Look the DOI up in $docSeq (third argument of key()) and sort on
           the seq attribute as an integer, so that 10 sorts after 2. -->
      <xsl:sort select="key('kSeq', fm/doi, $docSeq)/xs:integer(@seq)"/>
      <art>
        <title><xsl:value-of select="fm/title"/></title>
        <Name><xsl:value-of select="fm/aug/au"/></Name>
        <DOI><xsl:value-of select="fm/doi"/></DOI>
      </art>
    </xsl:for-each>
  </xsl:template>
</xsl:stylesheet>
You could also do this from the opposite direction:
XSLT 2.0
<!-- Opposite direction: walks the seq elements of Seq.xml in ascending order
     of their seq attribute and, for each one, outputs the article whose DOI
     equals its art-id. -->
<xsl:stylesheet version="2.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output omit-xml-declaration="yes"/>

  <!-- Temporary tree built from the documents whose URI matches
       'Sort/Sort##.xml'. -->
  <xsl:variable name="varCollection">
    <xsl:sequence select="collection('file:///D:/Sort/?select=Sort*.xml; recurse=yes')
        [matches(document-uri(.),'Sort/Sort[0-9][0-9].xml')]"/>
  </xsl:variable>

  <xsl:variable name="docSeq" select="document('D:/Sort/Seq.xml')"/>

  <!-- Finds an article by its DOI. -->
  <xsl:key name="article" match="article" use="fm/doi" />

  <xsl:template match="root">
    <xsl:for-each select="$docSeq/root/toc/seq">
      <!-- Numeric sort on the seq attribute. -->
      <xsl:sort select="@seq" data-type="number" order="ascending"/>
      <!-- Third argument of key(): search inside $varCollection. -->
      <xsl:apply-templates select="key('article', art-id, $varCollection)"/>
    </xsl:for-each>
  </xsl:template>

  <xsl:template match="article">
    <art>
      <title><xsl:value-of select="fm/title"/></title>
      <Name><xsl:value-of select="fm/aug/au"/></Name>
      <DOI><xsl:value-of select="fm/doi"/></DOI>
    </art>
  </xsl:template>
</xsl:stylesheet>
Note that the output lacks a root element.

XSLT: Replace string with Abbreviations

I would like to know how to replace the string with the abbreviations.
My XML looks like below
<concept reltype="CONTAINS" name="Left Ventricular Major Axis Diastolic Dimension, 4-chamber view" type="NUM">
<code meaning="Left Ventricular Major Axis Diastolic Dimension, 4-chamber view" value="18074-5" schema="LN" />
<measurement value="5.7585187646">
<code value="cm" schema="UCUM" />
</measurement>
<content>
<concept reltype="HAS ACQ CONTEXT" name="Image Mode" type="CODE">
<code meaning="Image Mode" value="G-0373" schema="SRT" />
<code meaning="2D mode" value="G-03A2" schema="SRT" />
</concept>
</content>
</concept>
and I am selecting some value from the xml like,
<xsl:value-of select="concept/measurement/code/@value"/>
Now what I want is, I have to replace cm with centimeter. I have so many words like this. I would like to have a xml for abbreviations and replace from them.
I saw one similar example here.
Using a Map in XSL for expanding abbreviations
That example replaces node text, but my text is in an attribute. It would also be better for me if I could find and replace at the point where I select the text with xsl:value-of select, instead of having a separate xsl:template. Please help. I am new to XSLT.
I have created XSLT v "1.1". For abbreviations I have created XML file as you have mentioned:
Abbreviation.xml:
<Abbreviations>
<Abbreviation>
<Short>cm</Short>
<Full>centimeter</Full>
</Abbreviation>
<Abbreviation>
<Short>m</Short>
<Full>meter</Full>
</Abbreviation>
</Abbreviations>
XSLT:
<!-- Expands a unit abbreviation by looking it up in Abbreviation.xml. -->
<xsl:stylesheet version="1.1" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output indent="yes" method="xml" />

  <!-- Lookup document: Abbreviations/Abbreviation entries with Short and
       Full children. -->
  <xsl:param name="AbbreviationDoc" select="document('Abbreviation.xml')"/>

  <xsl:template match="/">
    <xsl:call-template name="Convert">
      <!-- The abbreviation is read from the value attribute of
           concept/measurement/code. -->
      <xsl:with-param name="present" select="concept/measurement/code/@value"/>
    </xsl:call-template>
  </xsl:template>

  <!-- Convert: writes the Full text of the Abbreviation whose Short equals
       $present; when there is no such entry, writes $present unchanged. -->
  <xsl:template name="Convert">
    <xsl:param name="present"/>
    <xsl:choose>
      <xsl:when test="$AbbreviationDoc/Abbreviations/Abbreviation[Short = $present]">
        <xsl:value-of select="$AbbreviationDoc/Abbreviations/Abbreviation[Short = $present]/Full"/>
      </xsl:when>
      <xsl:otherwise>
        <xsl:value-of select="$present"/>
      </xsl:otherwise>
    </xsl:choose>
  </xsl:template>
</xsl:stylesheet>
INPUT:
as you have given: <xsl:value-of select="concept/measurement/code/@value"/>
OUTPUT:
centimeter
You just need to enhance this Abbreviation.xml to keep short and full value of abbreviation and call 'Convert' template with passing current value to get desired output.
Here a little shorter version:
- with abbreviations in xslt file
- make use of apply-templates with mode to make usage shorter.
But with xslt 1.0 node-set extension is required.
<?xml version="1.0" encoding="utf-8"?>
<!-- Expands a unit abbreviation using a lookup table kept inside the
     stylesheet. XSLT 1.0 needs the EXSLT node-set() extension to query the
     table. -->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:exsl="http://exslt.org/common"
                extension-element-prefixes="exsl">
  <xsl:output method="xml" indent="yes"/>

  <!-- Lookup table: the abbrev attribute is the short form, the element text
       is the long form. -->
  <xsl:variable name="abbreviations_txt">
    <abbreviation abbrev="cm" >centimeter</abbreviation>
    <abbreviation abbrev="m" >meter</abbreviation>
  </xsl:variable>
  <!-- The same table converted to a node-set so that it can be queried with
       XPath. -->
  <xsl:variable name="abbreviations" select="exsl:node-set($abbreviations_txt)" />

  <xsl:template match="/">
    <xsl:apply-templates select="concept/measurement/code/@value" mode="abbrev_to_text"/>
  </xsl:template>

  <!-- Applies to any element or attribute in mode abbrev_to_text: writes the
       long form when the node's value is a known abbreviation, otherwise the
       value itself. -->
  <xsl:template match="* | @*" mode="abbrev_to_text">
    <xsl:variable name="abbrev" select="." />
    <xsl:variable name="long_text" select="$abbreviations//abbreviation[@abbrev = $abbrev]/text()" />
    <xsl:value-of select="$long_text"/>
    <!-- Fallback: no table entry matched. -->
    <xsl:if test="not ($long_text)">
      <xsl:value-of select="$abbrev"/>
    </xsl:if>
  </xsl:template>
</xsl:stylesheet>