Import of the Unified State Register of Legal Entities of the Federal Tax Service by means of Apache NiFi. Step 3 - converting JSON using JOLT

In one of our projects, it became necessary to move the processes that import data from third-party systems to a microservice architecture. Apache NiFi was selected as the tool, and the import of the Unified State Register of Legal Entities (USRLE) of the Federal Tax Service was chosen as the first test subject.





The previous article described a way to convert XML to JSON using the AVRO schema.





This article describes a way to transform JSON using the JOLT specification.





Used processors and controllers

Dividing JSON into Pieces

The FlowFile obtained at the previous stage contains JSON with an array of USRLE statements for different organizations. First, let's split it into parts so that each FlowFile contains one statement.





For this we use the SplitJson processor. The only setting you need to specify is the JsonPath expression used to split the JSON into parts. In this case it is $.*





JsonPath documentation here





You can practice here





JSON conversion

The resulting JSON has an unnecessarily complex structure for later storage and processing. It is better to combine the address parts into one string, do the same for the full name, and move some elements higher in the hierarchy.





JSON before transformation
{
  "reportDate" : "2020-05-20",
  "ogrn" : "1234567890123",
  "ogrnDate" : "2002-12-30",
  "inn" : "1234567890",
  "kpp" : "123456789",
  "opfCode" : "12300",
  "opfName" : "   ",
  "name" : {
    "fullName" : "   ",
    "shortName" : ""
  },
  "address" : {
    "addressRF" : {
      "region" : {
        "type" : "",
        "name" : ""
      },
      "district" : null,
      "town" : {
        "type" : "",
        "name" : ""
      },
      "settlement" : null,
      "street" : {
        "type" : "",
        "name" : ""
      },
      "index" : "143500",
      "regionCode" : "50",
      "kladr" : "500000570000011",
      "house" : null,
      "building" : null,
      "apartment" : null
    }
  },
  "termination" : null,
  "capital" : null,
  "manageOrg" : null,
  "director" : [ {
    "fl" : {
      "lastName" : "",
      "firstName" : "",
      "patronymic" : "",
      "inn" : "123456789012"
    },
    "position" : {
      "ogrnip" : null,
      "typeCode" : "02",
      "typeName" : "  ",
      "name" : " "
    },
    "disqualification" : null
  } ],
  "founders" : {
    "founderULRF" : null,
    "founderULForeign" : null,
    "founderFL" : [ {
      "fl" : {
        "lastName" : "",
        "firstName" : "",
        "patronymic" : "",
        "inn" : "123456789012"
      },
      "capitalPart" : {
        "nominal" : 20000.0,
        "size" : {
          "percent" : 50.0,
          "decimalPart" : null,
          "simplePart" : null
        }
      }
    }, {
      "fl" : {
        "lastName" : "",
        "firstName" : "",
        "patronymic" : "",
        "inn" : "123456789021"
      },
      "capitalPart" : {
        "nominal" : 20000.0,
        "size" : {
          "percent" : 50.0,
          "decimalPart" : null,
          "simplePart" : null
        }
      }
    } ],
    "founderGov" : null,
    "founderPIF" : null
  },
  "capitalPart" : null,
  "holderReestrAO" : null,
  "okved" : {
    "mainOkved" : {
      "code" : "47.11",
      "name" : "    ,  ,      "
    },
    "addOkved" : null
  }
}
      
      



To transform the JSON we use the JoltTransformJSON processor.





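Its main settings:

  • Jolt Transformation DSL - the type of transformation. Here it is Chain - a sequence of several operations applied one after another

  • Jolt Specification - the transformation specification itself.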





JOLT

, - .





.





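Two operations are used here - shift and modify-overwrite-beta. The functions available in modify-overwrite-beta are best looked up in the JOLT source, Modifier.java. Specifications can be tried out at jolt-demo.appspot.com.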





JOLT
[
	{
		"operation": "modify-overwrite-beta",
		"spec": {
			"address": {
				"addressRF": {
					"region": "=concat(@(type), ' ', @(name))",
					"district": "=concat(@(type), ' ', @(name))",
					"town": "=concat(@(type), ' ', @(name))",
					"settlement": "=concat(@(type), ' ', @(name))",
					"street": "=concat(@(type), ' ', @(name))"
				}
			},
			"director": {
				"*": {
					"fl": {
						"fio": "=concat(@(1,lastName), ' ', @(1,firstName), ' ', @(1,patronymic))"
					}
				}
			},
			"founders": {
				"founderFL": {
					"*": {
						"fl": {
							"fio": "=concat(@(1,lastName), ' ', @(1,firstName), ' ', @(1,patronymic))"
						}
					}
				},
				"founderGov": {
					"*": {
						"founderImplFL": {
							"fl": {
								"fio": "=concat(@(1,lastName), ' ', @(1,firstName), ' ', @(1,patronymic))"
							}
						}
					}
				}
			}
		}
	},
	{
		"operation": "modify-overwrite-beta",
		"spec": {
			"address": {
				"addressRF": {
					"value": "=concat(@(1,index), ', ', @(1,region), ', ', @(1,district), ', ', @(1,town), ', ', @(1,settlement), ', ', @(1,street), ', ', @(1,house), ', ', @(1,building), ', ', @(1,apartment))",
					"fias": null
				}
			}
		}
	},
	{
		"operation": "shift",
		"spec": {
			"reportDate|ogrn|ogrnDate|inn|kpp|opfCode|opfName": "&",
			"name": {
				"*": "&"
			},
			"address": {
				"addressRF": {
					"kladr|regionCode|value|fias": "&2.&"
				}
			},
			"termination": {
				"method": {
					"*": "&2.&"
				},
				"*": "&1.&"
			},
			"capital": "&",
			"manageOrg": {
				"egrulData": {
					"*": "&2.&"
				}
			},
			"director": {
				"*": {
					"fl": {
						"fio|inn": "&3[&2].&"
					},
					"position": {
						"name": "&3[&2].&1",
						"*": "&3[&2].&"
					},
					"disqualification": "&2[&1].&"
				}
			},
			"founders": {
				"founderULRF|founderULForeign": {
					"*": {
						"egrulData|foreignReg": {
							"*": "&4.&3[&2].&"
						},
						"*": "&3.&2[&1].&"
					}
				},
				"founderFL": {
					"*": {
						"fl": {
							"fio|inn": "&4.&3[&2].&"
						},
						"*": "&3.&2[&1].&"
					}
				},
				"founderGov": {
					"*": {
						"govOrg": {
							"*": "&4.&3[&2].&"
						},
						"capitalPart": "&3.&2[&1].&",
						"founderImplUL": {
							"egrulData": {
								"*": "&5.&4[&3].&2.&"
							}
						},
						"founderImplFL": {
							"fl": {
								"fio|inn": "&5.&4[&3].&2.&"
							}
						}
					}
				},
				"founderPIF": {
					"*": {
						"PIFName": {
							"name": "&4.&3[&2].&1"
						},
						"manageOrg": {
							"egrulData": {
								"*": "&5.&4[&3].&"
							}
						},
						"capitalPart": "&3.&2[&1].&"
					}
				}
			},
			"capitalPart": "&",
			"holderReestrAO": {
				"egrulData": {
					"*": "&2.&"
				}
			},
			"okved": "&"
		}
	}
]
      
      



modify-overwrite-beta , .. .





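As you can see, the specification is an array of three operations: two modify-overwrite-beta and one shift. Each operation has a name - operation - and a body - spec.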





, .





modify-overwrite-beta

. , . , .





.





The first operation (the first modify-overwrite-beta) joins type and name for the region, district, town, settlement and street elements, using the expression "=concat(@(type), ' ', @(name))".





"address": {
				"addressRF": {
					"region": "=concat(@(type), ' ', @(name))",
					"district": "=concat(@(type), ' ', @(name))",
					"town": "=concat(@(type), ' ', @(name))",
					"settlement": "=concat(@(type), ' ', @(name))",
					"street": "=concat(@(type), ' ', @(name))"
				}
			}
      
      



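Take, for example, "region": "=concat(@(type), ' ', @(name))". Here the region element, which has the children type and name, is replaced by their concatenation. Because the expression is evaluated for region itself, its children are referenced simply as @(type) and @(name).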





The second operation (the second modify-overwrite-beta) assembles the complete address string and stores it in a new element - value.





"address": {
				"addressRF": {
					"value": "=concat(@(1,index), ', ', @(1,region), ', ', @(1,district), ', ', @(1,town), ', ', @(1,settlement), ', ', @(1,street), ', ', @(1,house), ', ', @(1,building), ', ', @(1,apartment))",
					"fias": null
				}
			}
      
      



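Note that the reference is now written as @(1,index). The 1 means that index is looked up one level higher. I.e. from value we go up to addressRF, and index is taken from addressRF.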





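In this expression, = marks the start of a function call, concat is the name of the function, and @(1,index) is one of its arguments.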





fias - a new element that is added with a null value.





. shift .





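In the director rule, note the "*" key. It matches every element: director is an array, so the fio rule is applied to each of its items.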





"director": {
				"*": {
					"fl": {
						"fio": "=concat(@(1,lastName), ' ', @(1,firstName), ' ', @(1,patronymic))"
					}
				}
			}
      
      



shift

shift JSON.





JSON
{
  "reportDate" : "2020-05-20",
  "ogrn" : "1234567890123",
  "ogrnDate" : "2002-12-30",
  "inn" : "1234567890",
  "kpp" : "123456789",
  "opfCode" : "12300",
  "opfName" : "   ",
  "name" : {
    "fullName" : "   ",
    "shortName" : ""
  },
  "address" : {
    "addressRF" : {
      "region" : " ",
      "district" : " ",
      "town" : " ",
      "settlement" : " ",
      "street" : " ",
      "index" : "143500",
      "regionCode" : "50",
      "kladr" : "500000570000011",
      "house" : null,
      "building" : null,
      "apartment" : null,
      "value" : "143500,  ,  ,  ,  ,  , , , ",
      "fias" : null
    }
  },
  "termination" : null,
  "capital" : null,
  "manageOrg" : null,
  "director" : [ {
    "fl" : {
      "lastName" : "",
      "firstName" : "",
      "patronymic" : "",
      "inn" : "123456789012",
      "fio" : "  "
    },
    "position" : {
      "ogrnip" : null,
      "typeCode" : "02",
      "typeName" : "  ",
      "name" : " "
    },
    "disqualification" : null
  } ],
  "founders" : {
    "founderULRF" : null,
    "founderULForeign" : null,
    "founderFL" : [ {
      "fl" : {
        "lastName" : "",
        "firstName" : "",
        "patronymic" : "",
        "inn" : "123456789012",
        "fio" : "  "
      },
      "capitalPart" : {
        "nominal" : 20000,
        "size" : {
          "percent" : 50,
          "decimalPart" : null,
          "simplePart" : null
        }
      }
    }, {
      "fl" : {
        "lastName" : "",
        "firstName" : "",
        "patronymic" : "",
        "inn" : "123456789021",
        "fio" : "  "
      },
      "capitalPart" : {
        "nominal" : 20000,
        "size" : {
          "percent" : 50,
          "decimalPart" : null,
          "simplePart" : null
        }
      }
    } ],
    "founderGov" : null,
    "founderPIF" : null
  },
  "capitalPart" : null,
  "holderReestrAO" : null,
  "okved" : {
    "mainOkved" : {
      "code" : "47.11",
      "name" : "    ,  ,      "
    },
    "addOkved" : null
  }
}
      
      



, - , , , . , modify-overwrite-beta , . , shift - , .





shift
{
		"operation": "shift",
		"spec": {
			"reportDate|ogrn|ogrnDate|inn|kpp|opfCode|opfName": "&",
			"name": {
				"*": "&"
			},
			"address": {
				"addressRF": {
					"kladr|regionCode|value|fias": "&2.&"
				}
			},
			"termination": {
				"method": {
					"*": "&2.&"
				},
				"*": "&1.&"
			},
			"capital": "&",
			"manageOrg": {
				"egrulData": {
					"*": "&2.&"
				}
			},
			"director": {
				"*": {
					"fl": {
						"fio|inn": "&3[&2].&"
					},
					"position": {
						"name": "&3[&2].&1",
						"*": "&3[&2].&"
					},
					"disqualification": "&2[&1].&"
				}
			},
			"founders": {
				"founderULRF|founderULForeign": {
					"*": {
						"egrulData|foreignReg": {
							"*": "&4.&3[&2].&"
						},
						"*": "&3.&2[&1].&"
					}
				},
				"founderFL": {
					"*": {
						"fl": {
							"fio|inn": "&4.&3[&2].&"
						},
						"*": "&3.&2[&1].&"
					}
				},
				"founderGov": {
					"*": {
						"govOrg": {
							"*": "&4.&3[&2].&"
						},
						"capitalPart": "&3.&2[&1].&",
						"founderImplUL": {
							"egrulData": {
								"*": "&5.&4[&3].&2.&"
							}
						},
						"founderImplFL": {
							"fl": {
								"fio|inn": "&5.&4[&3].&2.&"
							}
						}
					}
				},
				"founderPIF": {
					"*": {
						"PIFName": {
							"name": "&4.&3[&2].&1"
						},
						"manageOrg": {
							"egrulData": {
								"*": "&5.&4[&3].&"
							}
						},
						"capitalPart": "&3.&2[&1].&"
					}
				}
			},
			"capitalPart": "&",
			"holderReestrAO": {
				"egrulData": {
					"*": "&2.&"
				}
			},
			"okved": "&"
		}
	}
      
      



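shift moves elements. The left side of a rule names the element to take, and the right side says where to put it in the output. The & symbol stands for the name of the current element; it is the same as &0. &1 is the name of the parent element, &2 the name of its parent, and so on. & can also be used as part of a longer string, for example pre-&-post. I.e. if & is name, the result is pre-name-post.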





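The first rule - "reportDate|ogrn|ogrnDate|inn|kpp|opfCode|opfName": "&" - copies the listed elements to the output unchanged, each under its own name. The | symbol separates alternative element names that share the same rule.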





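fullName and shortName are moved to the root by the rule "name": { "*": "&" }. "*" matches every element inside name. "&" writes each matched element to the top level of the output under its own name.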





- .





"address": {
				"addressRF": {
					"kladr|regionCode|value|fias": "&2.&"
				}
			}
      
      



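Only the listed elements are kept, and they are moved one level up. The target path is "&2.&". Levels are counted upward from the current element, so &2 is address and & is the name of the current element. &1 would be addressRF, and it is skipped. I.e. the output contains address.kladr, address.regionCode, address.value and address.fias. The remaining addressRF elements are not included in the output JSON.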









"director" : [ {
    "fl" : {
      "lastName" : "",
      "firstName" : "",
      "patronymic" : "",
      "inn" : "123456789012",
      "fio" : "  "
    },
    "position" : {
      "ogrnip" : null,
      "typeCode" : "02",
      "typeName" : "  ",
      "name" : " "
    },
    "disqualification" : null
  } ]
      
      



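lastName, firstName and patronymic are dropped.

inn and fio are moved up one level.

ogrnip, typeCode and typeName are moved up one level.

name is moved up one level and renamed to position.

disqualification is left unchanged.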





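director is an array, so the position of each item must be preserved in the output. For this, in addition to &, the array-index form [&] is used.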





"director": {
				"*": {
					"fl": {
						"fio|inn": "&3[&2].&"
					},
					"position": {
						"name": "&3[&2].&1",
						"*": "&3[&2].&"
					},
					"disqualification": "&2[&1].&"
				}
			}
      
      



First, fio and inn. The target path is &3[&2].&. Reading it left to right: &3 - director, [&2] - the index of the array item, & - fio or inn.





Next, name inside position. &3 - director, [&2] - the index of the array item, &1 - position. There is no trailing &, so the value of name is written under the key position.





The rest of the elements in position are simply moved up one level. disqualification remains unchanged.





Further, similar constructions are used.





Example

And finally, here are the original JSON, the JOLT specification and the resulting JSON once more, in full.





Raw JSON
{
  "reportDate": "2020-05-20",
  "ogrn": "1234567890123",
  "ogrnDate": "2002-12-30",
  "inn": "1234567890",
  "kpp": "123456789",
  "opfCode": "12300",
  "opfName": "   ",
  "name": {
    "fullName": "   ",
    "shortName": ""
  },
  "address": {
    "addressRF": {
      "region": {
        "type": "",
        "name": ""
      },
      "district": null,
      "town": {
        "type": "",
        "name": ""
      },
      "settlement": null,
      "street": {
        "type": "",
        "name": ""
      },
      "index": "143500",
      "regionCode": "50",
      "kladr": "500000570000011",
      "house": null,
      "building": null,
      "apartment": null
    }
  },
  "termination": null,
  "capital": null,
  "manageOrg": null,
  "director": [
    {
      "fl": {
        "lastName": "",
        "firstName": "",
        "patronymic": "",
        "inn": "123456789012"
      },
      "position": {
        "ogrnip": null,
        "typeCode": "02",
        "typeName": "  ",
        "name": " "
      },
      "disqualification": null
    }
  ],
  "founders": {
    "founderULRF": null,
    "founderULForeign": null,
    "founderFL": [
      {
        "fl": {
          "lastName": "",
          "firstName": "",
          "patronymic": "",
          "inn": "123456789012"
        },
        "capitalPart": {
          "nominal": 20000,
          "size": {
            "percent": 50,
            "decimalPart": null,
            "simplePart": null
          }
        }
      },
      {
        "fl": {
          "lastName": "",
          "firstName": "",
          "patronymic": "",
          "inn": "123456789021"
        },
        "capitalPart": {
          "nominal": 20000,
          "size": {
            "percent": 50,
            "decimalPart": null,
            "simplePart": null
          }
        }
      }
    ],
    "founderGov": null,
    "founderPIF": null
  },
  "capitalPart": null,
  "holderReestrAO": null,
  "okved": {
    "mainOkved": {
      "code": "47.11",
      "name": "    ,  ,      "
    },
    "addOkved": null
  }
}
      
      



JOLT specification
[
	{
		"operation": "modify-overwrite-beta",
		"spec": {
			"address": {
				"addressRF": {
					"region": "=concat(@(type), ' ', @(name))",
					"district": "=concat(@(type), ' ', @(name))",
					"town": "=concat(@(type), ' ', @(name))",
					"settlement": "=concat(@(type), ' ', @(name))",
					"street": "=concat(@(type), ' ', @(name))"
				}
			},
			"director": {
				"*": {
					"fl": {
						"fio": "=concat(@(1,lastName), ' ', @(1,firstName), ' ', @(1,patronymic))"
					}
				}
			},
			"founders": {
				"founderFL": {
					"*": {
						"fl": {
							"fio": "=concat(@(1,lastName), ' ', @(1,firstName), ' ', @(1,patronymic))"
						}
					}
				},
				"founderGov": {
					"*": {
						"founderImplFL": {
							"fl": {
								"fio": "=concat(@(1,lastName), ' ', @(1,firstName), ' ', @(1,patronymic))"
							}
						}
					}
				}
			}
		}
	},
	{
		"operation": "modify-overwrite-beta",
		"spec": {
			"address": {
				"addressRF": {
					"value": "=concat(@(1,index), ', ', @(1,region), ', ', @(1,district), ', ', @(1,town), ', ', @(1,settlement), ', ', @(1,street), ', ', @(1,house), ', ', @(1,building), ', ', @(1,apartment))",
					"fias": null
				}
			}
		}
	},
	{
		"operation": "shift",
		"spec": {
			"reportDate|ogrn|ogrnDate|inn|kpp|opfCode|opfName": "&",
			"name": {
				"*": "&"
			},
			"address": {
				"addressRF": {
					"kladr|regionCode|value|fias": "&2.&"
				}
			},
			"termination": {
				"method": {
					"*": "&2.&"
				},
				"*": "&1.&"
			},
			"capital": "&",
			"manageOrg": {
				"egrulData": {
					"*": "&2.&"
				}
			},
			"director": {
				"*": {
					"fl": {
						"fio|inn": "&3[&2].&"
					},
					"position": {
						"name": "&3[&2].&1",
						"*": "&3[&2].&"
					},
					"disqualification": "&2[&1].&"
				}
			},
			"founders": {
				"founderULRF|founderULForeign": {
					"*": {
						"egrulData|foreignReg": {
							"*": "&4.&3[&2].&"
						},
						"*": "&3.&2[&1].&"
					}
				},
				"founderFL": {
					"*": {
						"fl": {
							"fio|inn": "&4.&3[&2].&"
						},
						"*": "&3.&2[&1].&"
					}
				},
				"founderGov": {
					"*": {
						"govOrg": {
							"*": "&4.&3[&2].&"
						},
						"capitalPart": "&3.&2[&1].&",
						"founderImplUL": {
							"egrulData": {
								"*": "&5.&4[&3].&2.&"
							}
						},
						"founderImplFL": {
							"fl": {
								"fio|inn": "&5.&4[&3].&2.&"
							}
						}
					}
				},
				"founderPIF": {
					"*": {
						"PIFName": {
							"name": "&4.&3[&2].&1"
						},
						"manageOrg": {
							"egrulData": {
								"*": "&5.&4[&3].&"
							}
						},
						"capitalPart": "&3.&2[&1].&"
					}
				}
			},
			"capitalPart": "&",
			"holderReestrAO": {
				"egrulData": {
					"*": "&2.&"
				}
			},
			"okved": "&"
		}
	}
]

      
      



Resulting JSON
{
  "reportDate" : "2020-05-20",
  "ogrn" : "1234567890123",
  "ogrnDate" : "2002-12-30",
  "inn" : "1234567890",
  "kpp" : "123456789",
  "opfCode" : "12300",
  "opfName" : "   ",
  "fullName" : "   ",
  "shortName" : "",
  "address" : {
    "kladr" : "500000570000011",
    "regionCode" : "50",
    "value" : "143500,  ,  ,  ,  ,  , , , ",
    "fias" : null
  },
  "capital" : null,
  "director" : [ {
    "fio" : "  ",
    "inn" : "123456789012",
    "ogrnip" : null,
    "typeCode" : "02",
    "typeName" : "  ",
    "position" : " ",
    "disqualification" : null
  } ],
  "founders" : {
    "founderFL" : [ {
      "fio" : "  ",
      "inn" : "123456789012",
      "capitalPart" : {
        "nominal" : 20000,
        "size" : {
          "percent" : 50,
          "decimalPart" : null,
          "simplePart" : null
        }
      }
    }, {
      "fio" : "  ",
      "inn" : "123456789021",
      "capitalPart" : {
        "nominal" : 20000,
        "size" : {
          "percent" : 50,
          "decimalPart" : null,
          "simplePart" : null
        }
      }
    } ]
  },
  "capitalPart" : null,
  "okved" : {
    "mainOkved" : {
      "code" : "47.11",
      "name" : "    ,  ,      "
    },
    "addOkved" : null
  }
}
      
      



Further

Next, the resulting JSON should be placed somewhere for storage and further use. That is beyond the scope of this article; use whatever storage is convenient for you.








All Articles