I want to examine a data set of a web shop. The data set includes the number of visits and the number of orders including some personal data.
I want to find out, on which values a "high order rate" depends. My idea was to use a decision tree for that because the rules are easy to understand and you can easily take action according to the results.
I created a new column called "abortion rate". Its the relation of visits and orders. Everything above 50 % is high, the others are low.
After that I split the data in training and test data and generate a model based on a decision tree.
My problem is, that there are just "a few" records with a low abortion rate. The decision tree seems to work fine but actually it declares nearly all records as high, just 5 records with low abortion rate are correct.
I don't really know how to figure this out. Any ideas?
Thanks in advance.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="5.3.015" expanded="true" height="60" name="Read CSV" width="90" x="45" y="30">
<parameter key="csv_file" value="C:\temp\WebData.csv"/>
<parameter key="column_separators" value=";"/>
<parameter key="trim_lines" value="false"/>
<parameter key="use_quotes" value="true"/>
<parameter key="quotes_character" value="""/>
<parameter key="escape_character" value="\"/>
<parameter key="skip_comments" value="false"/>
<parameter key="comment_characters" value="#"/>
<parameter key="parse_numbers" value="true"/>
<parameter key="decimal_character" value=","/>
<parameter key="grouped_digits" value="false"/>
<parameter key="grouping_character" value=","/>
<parameter key="date_format" value=""/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<parameter key="time_zone" value="SYSTEM"/>
<parameter key="locale" value="English (United States)"/>
<parameter key="encoding" value="windows-1252"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="ID.true.integer.attribute"/>
<parameter key="1" value="Age.true.integer.attribute"/>
<parameter key="2" value="Sex.true.polynominal.attribute"/>
<parameter key="3" value="Income.true.polynominal.attribute"/>
<parameter key="4" value="PersonsHoushold.true.integer.attribute"/>
<parameter key="5" value="ShippingFlat.true.binominal.attribute"/>
<parameter key="6" value="OneKlickBuy.true.binominal.attribute"/>
<parameter key="7" value="CustomerSince.true.real.attribute"/>
<parameter key="8" value="Visits.true.integer.attribute"/>
<parameter key="9" value="Orders.true.integer.attribute"/>
<parameter key="10" value="att11.false.attribute_value.attribute"/>
</list>
<parameter key="read_not_matching_values_as_missings" value="false"/>
<parameter key="datamanagement" value="double_array"/>
</operator>
<operator activated="true" class="generate_attributes" compatibility="5.3.015" expanded="true" height="76" name="Generate Attributes" width="90" x="179" y="30">
<list key="function_descriptions">
<parameter key="AbortionRate" value="(Visits-Orders)/Visits"/>
</list>
<parameter key="use_standard_constants" value="true"/>
<parameter key="keep_all" value="true"/>
</operator>
<operator activated="true" class="generate_attributes" compatibility="5.3.015" expanded="true" height="76" name="Generate Attributes (2)" width="90" x="313" y="30">
<list key="function_descriptions">
<parameter key="AbortionRate_Bio" value="(if(AbortionRate<0.5, "low","high"))"/>
</list>
<parameter key="use_standard_constants" value="true"/>
<parameter key="keep_all" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.015" expanded="true" height="76" name="Set Role" width="90" x="447" y="30">
<parameter key="attribute_name" value="AbortionRate_Bio"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="581" y="30">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attribute" value=""/>
<parameter key="attributes" value="Income|Sex|OneKlickBuy|ShippingFlat|Age|PersonsHoushold|AbortionRate_Bio|CustomerSince"/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="attribute_value"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="attribute_block"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
</operator>
<operator activated="true" class="split_data" compatibility="5.3.015" expanded="true" height="94" name="Split Data" width="90" x="715" y="30">
<enumeration key="partitions">
<parameter key="ratio" value="0.7"/>
<parameter key="ratio" value="0.3"/>
</enumeration>
<parameter key="sampling_type" value="shuffled sampling"/>
<parameter key="use_local_random_seed" value="false"/>
<parameter key="local_random_seed" value="1992"/>
</operator>
<operator activated="true" class="decision_tree" compatibility="5.3.015" expanded="true" height="76" name="Decision Tree" width="90" x="447" y="210">
<parameter key="criterion" value="gain_ratio"/>
<parameter key="minimal_size_for_split" value="4"/>
<parameter key="minimal_leaf_size" value="2"/>
<parameter key="minimal_gain" value="0.02"/>
<parameter key="maximal_depth" value="20"/>
<parameter key="confidence" value="0.25"/>
<parameter key="number_of_prepruning_alternatives" value="3"/>
<parameter key="no_pre_pruning" value="false"/>
<parameter key="no_pruning" value="false"/>
</operator>
<operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="Apply Model" width="90" x="648" y="210">
<list key="application_parameters"/>
<parameter key="create_view" value="false"/>
</operator>
<connect from_op="Read CSV" from_port="output" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
<connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Split Data" to_port="example set"/>
<connect from_op="Split Data" from_port="partition 1" to_op="Decision Tree" to_port="training set"/>
<connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Decision Tree" from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
<connect from_op="Apply Model" from_port="model" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
UPDATE:
I changed my Rapidminer process to this:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="5.3.015" expanded="true" height="60" name="Read CSV" width="90" x="45" y="30">
<parameter key="csv_file" value="C:\temp\WebData.csv"/>
<parameter key="decimal_character" value=","/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<parameter key="encoding" value="windows-1252"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="ID.true.integer.attribute"/>
<parameter key="1" value="Age.true.integer.attribute"/>
<parameter key="2" value="Sex.true.polynominal.attribute"/>
<parameter key="3" value="Income.true.polynominal.attribute"/>
<parameter key="4" value="PersonsHoushold.true.integer.attribute"/>
<parameter key="5" value="ShippingFlat.true.binominal.attribute"/>
<parameter key="6" value="OneKlickBuy.true.binominal.attribute"/>
<parameter key="7" value="CustomerSince.true.real.attribute"/>
<parameter key="8" value="Visits.true.integer.attribute"/>
<parameter key="9" value="Orders.true.integer.attribute"/>
<parameter key="10" value="att11.false.attribute_value.attribute"/>
</list>
<parameter key="read_not_matching_values_as_missings" value="false"/>
</operator>
<operator activated="true" class="generate_attributes" compatibility="5.3.015" expanded="true" height="76" name="Generate Attributes" width="90" x="179" y="30">
<list key="function_descriptions">
<parameter key="A_AbortionRate" value="(Visits-Orders)/Visits"/>
</list>
</operator>
<operator activated="true" class="generate_attributes" compatibility="5.3.015" expanded="true" height="76" name="Generate Attributes (2)" width="90" x="313" y="30">
<list key="function_descriptions">
<parameter key="A_AbortionRate_Bio" value="(if(AbortionRate<0.5, "low","high"))"/>
<parameter key="A_Age" value="(if(Age<18,"Teen",if(Age<30,"young Adult",if(Age<60,"Adult","Pensioner"))))"/>
<parameter key="A_CustomerSince" value="(if(CustomerSince<=1,"up to 1 Year",if(CustomerSince<=2,"1-2 Years",if(CustomerSince<=3,"2-3 Years","over 3 Years"))))"/>
<parameter key="A_PersonsHoushold" value="(if(PersonsHoushold<=1,"Single",if(PersonsHoushold<=2,"Pair",if(PersonsHoushold<=5,"Family","extended family"))))"/>
</list>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.015" expanded="true" height="76" name="Set Role" width="90" x="447" y="30">
<parameter key="attribute_name" value="A_AbortionRate_Bio"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles">
<parameter key="K_ID" value="id"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="581" y="30">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="A_AbortionRate_Bio|A_Aage|A_CustomerSince|Income|Sex|OneKlickBuy|ShippingFlat|A_PersonsHoushold"/>
</operator>
<operator activated="true" class="split_validation" compatibility="5.3.015" expanded="true" height="130" name="Validation" width="90" x="715" y="30">
<parameter key="sampling_type" value="stratified sampling"/>
<process expanded="true">
<operator activated="true" class="multiply" compatibility="5.3.015" expanded="true" height="94" name="Multiply (2)" width="90" x="45" y="30"/>
<operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (3)" width="90" x="179" y="120">
<parameter key="condition_class" value="attribute_value_filter"/>
<parameter key="parameter_string" value="A_AbortionRate_Bio = high"/>
</operator>
<operator activated="true" class="sample" compatibility="5.3.015" expanded="true" height="76" name="Sample (4)" width="90" x="313" y="120">
<parameter key="sample_size" value="1675"/>
<list key="sample_size_per_class"/>
<list key="sample_ratio_per_class"/>
<list key="sample_probability_per_class"/>
</operator>
<operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (4)" width="90" x="179" y="30">
<parameter key="condition_class" value="attribute_value_filter"/>
<parameter key="parameter_string" value="A_AbortionRate_Bio = low"/>
</operator>
<operator activated="true" class="sample" compatibility="5.3.015" expanded="true" height="76" name="Sample" width="90" x="313" y="30">
<parameter key="sample_size" value="1675"/>
<list key="sample_size_per_class"/>
<list key="sample_ratio_per_class"/>
<list key="sample_probability_per_class"/>
</operator>
<operator activated="true" class="append" compatibility="5.3.015" expanded="true" height="94" name="Append (2)" width="90" x="447" y="75"/>
<operator activated="true" class="decision_tree" compatibility="5.3.015" expanded="true" height="76" name="Decision Tree (2)" width="90" x="581" y="75">
<parameter key="criterion" value="information_gain"/>
<parameter key="minimal_gain" value="0.025"/>
<parameter key="maximal_depth" value="6"/>
</operator>
<connect from_port="training" to_op="Multiply (2)" to_port="input"/>
<connect from_op="Multiply (2)" from_port="output 1" to_op="Filter Examples (4)" to_port="example set input"/>
<connect from_op="Multiply (2)" from_port="output 2" to_op="Filter Examples (3)" to_port="example set input"/>
<connect from_op="Filter Examples (3)" from_port="example set output" to_op="Sample (4)" to_port="example set input"/>
<connect from_op="Sample (4)" from_port="example set output" to_op="Append (2)" to_port="example set 2"/>
<connect from_op="Filter Examples (4)" from_port="example set output" to_op="Sample" to_port="example set input"/>
<connect from_op="Sample" from_port="example set output" to_op="Append (2)" to_port="example set 1"/>
<connect from_op="Append (2)" from_port="merged set" to_op="Decision Tree (2)" to_port="training set"/>
<connect from_op="Decision Tree (2)" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="Apply Model (2)" width="90" x="45" y="30">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance" compatibility="5.3.015" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
<connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
<connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
<portSpacing port="sink_averagable 3" spacing="0"/>
</process>
</operator>
<connect from_op="Read CSV" from_port="output" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
<connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Validation" to_port="training"/>
<connect from_op="Validation" from_port="model" to_port="result 1"/>
<connect from_op="Validation" from_port="training" to_port="result 2"/>
<connect from_op="Validation" from_port="averagable 1" to_port="result 3"/>
<connect from_op="Validation" from_port="averagable 2" to_port="result 4"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>
The results are better but the are still bad.
class prediction is 91.53 % (for high) and 33.47 % (for low). And low is the interesting part... :)
Any ideas?
Related
I am a new in Rapidminer, so i have a huge dataset and i use Principle component analysis to reduce dimensionality, the problem is when i get the PCs i do not know how to select the records depend on it how can i make a new dataset which is reduced ?
this what i am tried to use :
and this what i get :
you could use the "Weight by PCA" operator to calculate weights for attribute importance and then use the "Select by Weights" operator to reduce the number of attributes in your original data set.
Check the attached example process below (just c&p the XML into your RapidMiner process window).
Also feel free to take a look or ask questions at the RapidMiner community
<?xml version="1.0" encoding="UTF-8"?><process version="9.2.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Root" origin="GENERATED_TUTORIAL">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="9.2.000" expanded="true" height="68" name="Sonar" origin="GENERATED_TUTORIAL" width="90" x="112" y="34">
<parameter key="repository_entry" value="//Samples/data/Sonar"/>
</operator>
<operator activated="true" class="weight_by_pca" compatibility="9.2.000" expanded="true" height="82" name="Weight by PCA" width="90" x="313" y="34">
<parameter key="normalize_weights" value="true"/>
<parameter key="sort_weights" value="true"/>
<parameter key="sort_direction" value="ascending"/>
<parameter key="component_number" value="1"/>
</operator>
<operator activated="true" class="select_by_weights" compatibility="9.2.000" expanded="true" height="103" name="Select by Weights" width="90" x="581" y="34">
<parameter key="weight_relation" value="greater equals"/>
<parameter key="weight" value="0.5"/>
<parameter key="k" value="10"/>
<parameter key="p" value="0.5"/>
<parameter key="deselect_unknown" value="true"/>
<parameter key="use_absolute_weights" value="true"/>
</operator>
<connect from_op="Sonar" from_port="output" to_op="Weight by PCA" to_port="example set"/>
<connect from_op="Weight by PCA" from_port="weights" to_op="Select by Weights" to_port="weights"/>
<connect from_op="Weight by PCA" from_port="example set" to_op="Select by Weights" to_port="example set input"/>
<connect from_op="Select by Weights" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="162"/>
</process>
</operator>
</process>
My Problem is that if I have a missing value in a row, I want to replace this value with another one from that row. For example I want to replace the missing value with the appropriate "Belegnummer"
in general there is a operator called Replace Missing Values which does exactly what the name suggests.
In your special case you want to access the values of another attribute (column), so the Generate Attributes operator offers a very powerful expression builder where you can declare an If-statement of that form if(a1==MISSING_NUMERIC, a2,a1)
See the screenshot above for an example or copy&paste the process XML into your RapidMiner process window.
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.000-BETA">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.0.000-BETA" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="subprocess" compatibility="9.0.000-BETA" expanded="true" height="82" name="Subprocess" width="90" x="112" y="34">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="9.0.000-BETA" expanded="true" height="68" name="Retrieve Iris" width="90" x="45" y="34">
<parameter key="repository_entry" value="//Samples/data/Iris"/>
</operator>
<operator activated="true" class="declare_missing_value" compatibility="9.0.000-BETA" expanded="true" height="82" name="Declare Missing Value" width="90" x="179" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="a1"/>
<parameter key="mode" value="expression"/>
<parameter key="expression_value" value="a1 <5"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="9.0.000-BETA" expanded="true" height="82" name="Select Attributes" width="90" x="380" y="34">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="a2|a1"/>
</operator>
<connect from_op="Retrieve Iris" from_port="output" to_op="Declare Missing Value" to_port="example set input"/>
<connect from_op="Declare Missing Value" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_port="out 1"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="generate_attributes" compatibility="9.0.000-BETA" expanded="true" height="82" name="Generate Attributes" width="90" x="447" y="34">
<list key="function_descriptions">
<parameter key="a1_new" value="if(a1==MISSING_NUMERIC, a2,a1)"/>
</list>
</operator>
<connect from_op="Subprocess" from_port="out 1" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<description align="center" color="yellow" colored="false" height="181" resized="true" width="529" x="275" y="126">With the expression parser more complex statements can be defined. In this case:<br>if(a1==MISSING_NUMERIC, a2,a1)<br/><br/>meaning that if the value of attribute a1 is missing, it will be replaced by the value of a2 otherwise the value of a1 is kept.<br/><br/>Instead of creating a new attribute the old one can also be overwritten<br/><br></description>
</process>
</operator>
</process>
I am having a challenge with using Rapid Miner to reduce the feature dimensions for text mining. at this point i am processing the text by word tokens and it is resulting in a very big dimension set that is not ideal for modeling and prediction.
how can i improve the process to use other methods to clean the data and only take on relevant words?
i have tried applying tfidf but it removes the target variable and i am not able to see what it does before the model stage.
Thanks
The Process Documents operator has a pruning option where, with some careful setting of parameters, you can remove common and rare attributes.
Here's a toy example to show it working.
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="text:create_document" compatibility="7.4.001" expanded="true" height="68" name="Create Document" width="90" x="179" y="187">
<parameter key="text" value="the cat sat on the mat
the dog barked at the man
the cow ate the grass
the man sat on the grass"/>
</operator>
<operator activated="true" class="text:create_document" compatibility="7.4.001" expanded="true" height="68" name="Create Document (2)" width="90" x="179" y="289">
<parameter key="text" value="the cat sat on the mat
the man sat on the grass
the rain in spain falls mainly on the plain"/>
</operator>
<operator activated="true" class="text:create_document" compatibility="7.4.001" expanded="true" height="68" name="Create Document (3)" width="90" x="179" y="391">
<parameter key="text" value="the world is round"/>
</operator>
<operator activated="true" class="text:process_documents" compatibility="7.4.001" expanded="true" height="145" name="Process Documents" width="90" x="447" y="187">
<parameter key="vector_creation" value="Term Occurrences"/>
<parameter key="prune_method" value="absolute"/>
<parameter key="prune_above_percent" value="40.0"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="5"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="246" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_op="Create Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
<connect from_op="Create Document (2)" from_port="output" to_op="Process Documents" to_port="documents 2"/>
<connect from_op="Create Document (3)" from_port="output" to_op="Process Documents" to_port="documents 3"/>
<connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
It requires some care to get it just right but hopefully this will get you started.
I have some tweets data in a csv file and I want to (1) extract only the hashtags (with special characters like ##cloudcomputing, #cloud_computing, #101Cloud, etc and considering multiple case representation as one hashtag like #edtech, #Edtech, #EdTech,etc), (2) group the tweets based on hashtags and (3) count the number of times each hashtag word occurred in a document or across documents of a corpus. I understand that I have to use regular expressions in the Filter Examples operator on the 'Title' column but don't know how to.
Some regular expressions I tried to use but which failed are: attribute name (Title) = regular expression
(?i)#.*
^#
/(#\w+)/u
/(#[a-z0-9][a-z0-9-_]*)/ig
Here is the link to the XML code of the process I have created
https://myexplorations.sharefile.com/d-sefddef555cf4d61a
Here is the link to the source data file
https://myexplorations.sharefile.com/d-s92197963830466cb
Could someone help please?
Both the files are in XML format, but not in a standard RapidMiner format, one of them looks a bit like MS Word, is that right?
Anyway, feel free to repost the data in a different format, but I think this might help.
First make sure you have the Text Processing Extension from RapidMiner.
Next use Process Documents from Data and inside it use the following 3 operators: Transform Cases, Cut Document & Combine Documents. What these do is for each example in your CSV make the text lower case, extract the hashtags individually from the text & then combine them into a new document (in case there are more than one hashtag in a piece of text).
The regEx I used was (?i)#[0-9a-z_]*, this was just for speed, but it should capture all the cases I could think of.
The output of this process is a wordlist count across the corpus telling you how many times a hashtag occured in a document. That should get you started.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.0.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="7.0.001" expanded="true" height="68" name="Read CSV" width="90" x="45" y="136">
<parameter key="csv_file" value="myCSV"/>
<parameter key="column_separators" value=","/>
<list key="annotations"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="myTextColum.true.text.regular"/>
<parameter key="1" value="anotherColumn.true.nominal.regular"/>
</list>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="7.0.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="179" y="85">
<parameter key="vector_creation" value="Term Occurrences"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:transform_cases" compatibility="7.0.000" expanded="true" height="68" name="Transform Cases" width="90" x="45" y="34">
<description align="center" color="transparent" colored="false" width="126">Makes everything lowercase</description>
</operator>
<operator activated="true" class="text:cut_document" compatibility="7.0.000" expanded="true" height="68" name="Cut Document" width="90" x="179" y="34">
<parameter key="query_type" value="Regular Expression"/>
<list key="string_machting_queries"/>
<list key="regular_expression_queries">
<parameter key="hashtags" value="(?i)#[0-9a-z_]*"/>
</list>
<list key="regular_region_queries"/>
<list key="xpath_queries"/>
<list key="namespaces"/>
<list key="index_queries"/>
<list key="jsonpath_queries"/>
<process expanded="true">
<connect from_port="segment" to_port="document 1"/>
<portSpacing port="source_segment" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
<description align="center" color="transparent" colored="false" width="126">Gets rid of everything but the hashtags</description>
</operator>
<operator activated="true" class="text:combine_documents" compatibility="7.0.000" expanded="true" height="82" name="Combine Documents" width="90" x="313" y="34"/>
<connect from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Cut Document" to_port="document"/>
<connect from_op="Cut Document" from_port="documents" to_op="Combine Documents" to_port="documents 1"/>
<connect from_op="Combine Documents" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_op="Read CSV" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
<connect from_op="Process Documents from Data" from_port="word list" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="21"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
I'm trying to transform my xml file:
<root>
<group id="F_123" >
<term id="F_123_d" >
<word>blabla</word>
<instruction>blabla</instruction>
</term>
<term id="F_123">
<word>blabla</word>
<instruction>blabla</instruction>
<numbers>
<number code="01" >1</number>
<number code="02" >2</number>
<number code="03" >3</number>
<number code="04" >4</number>
<number code="05" >5</number>
</numbers>
</term>
<term id="F_124">
<word>blabla</word>
<numbers>
<number code="01" >1</number>
<number code="02" >2</number>
<number code="03" >3</number>
<number code="04" >4</number>
<number code="05" >5</number>
</numbers>
</term>
<term id="F_125">
<word>blabla</word>
<numbers>
<number code="01" >1</number>
<number code="02" >2</number>
<number code="03" >3</number>
<number code="04" >4</number>
<number code="05" >5</number>
</numbers>
</term>
<routing id="F_123_1">
<condition>
<operator type="or">
<operator type="or">
<operator type="equal">
<variable name="F_D01a3DE1"/>
<constant>DK</constant>
</operator>
<operator type="equal">
<variable name="F_D01a3DE1"/>
<constant>RF</constant>
</operator>
</operator>
<operator type="equal">
<variable name="F_D01a3DE1"/>
<constant>1</constant>
</operator>
</operator>
</condition>
<then>
<goto group="A_24"/>
</then>
<else>
<routing>
<condition>
<operator type="or">
<operator type="or">
<operator type="equal">
<variable name="B_D01a3DE1"/>
<constant>5</constant>
</operator>
<operator type="equal">
<variable name="B_D01a3DE1"/>
<constant>10</constant>
</operator>
</operator>
<operator type="equal">
<variable name="B_D01a3DE1"/>
<constant>7</constant>
</operator>
</operator>
</condition>
<then>
<goto group="A_25"/>
</then>
<else>
<routing>
<condition>
<operator type="or">
<operator type="equal">
<variable name="B_D01a3DE1"/>
<constant>6</constant>
</operator>
<operator type="equal">
<variable name="B_D01a3DE1"/>
<constant>11</constant>
</operator>
</operator>
</condition>
<then>
<goto group="A_26"/>
</then>
<else>
<goto group="A_27"/>
</else>
</routing>
</else>
</routing>
</else>
</routing>
</group>
<group id="A_25" >
<term id="A_25" >
<word>blabla</word>
<instruction>blabla</instruction>
</term>
<term id="A_26">
<word>blabla</word>
<instruction>blabla</instruction>
<numbers>
<number code="01" >1</number>
<number code="02" >2</number>
</numbers>
</term>
</group>
</root>
I want to access the value of #group/term/#id and make one element per each term in <group id="A_25">. Is it possible?
It's not very clear what exactly do you want, so a guess follows:
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:template match="group/term/#id">
<id>
<xsl:value-of select="."/>
</id>
</xsl:template>
<xsl:template match="/">
<root>
<xsl:apply-templates select="node()" />
</root>
</xsl:template>
<xsl:template match="node()|#*">
<xsl:apply-templates select="node()|#*" />
</xsl:template>
</xsl:stylesheet>
When applied to your example (I had to add a root element to it, so that it's valid), it produces:
<root>
<id>F_123_d</id>
<id>F_123</id>
<id>F_124</id>
<id>F_125</id>
<id>A_25</id>
<id>A_26</id>
</root>
I want to access the value of
#group/term/#id and make one element
per each term in <group id="A_25">. Is
it possible?
This XPath expression select what I think you want:
/root/group[#id='A_25']/term/#id
Also, this stylesheet process what I think you want:
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template match="text()"/>
<xsl:template match="group[#id='A_25']/term">
<element id="{#id}"/>
</xsl:template>
</xsl:stylesheet>
Output:
<element id="A_25" />
<element id="A_26" />