Skip to content

Commit

Permalink
#539 Add support for cp1364 and cp1388.
Browse files Browse the repository at this point in the history
Conversion tables contributed by [@BenceBenedek](https://github.com/BenceBenedek).
  • Loading branch information
yruslan committed Apr 5, 2023
1 parent aa7c898 commit 9492737
Show file tree
Hide file tree
Showing 11 changed files with 3,471 additions and 63 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -754,12 +754,14 @@ The following code pages are supported:
* `common_extended` - EBCDIC common characters with special characters extension
* `cp037` - IBM EBCDIC US-Canada
* `cp037_extended` - IBM EBCDIC US-Canada with special characters extension
* `cp300` - IBM EBCDIC Japanese Extended (2 byte code page)
* `cp838` - IBM EBCDIC Thailand
* `cp870` - IBM EBCDIC Multilingual Latin-2
* `cp875` - IBM EBCDIC Greek
* `cp1025` - IBM EBCDIC Multilingual Cyrillic
* `cp1047` - IBM EBCDIC Latin-1/Open System
* `cp00300` - (experimental support) IBM EBCDIC Japanese (Katakana) Extended (2 byte code page)
* `cp1364` - (experimental support) IBM EBCDIC Korean (2 byte code page)
* `cp1388` - (experimental support) IBM EBCDIC Simplified Chinese (2 byte code page)

By default, Cobrix uses the `common` EBCDIC code page, which contains only basic Latin characters, numbers, and punctuation.
You can specify the code page to use for all string fields by setting the `ebcdic_code_page` option to one of the
Expand Down Expand Up @@ -1629,6 +1631,7 @@ A: Update hadoop dll to version 3.2.2 or newer.

## Changelog
- #### 2.6.5 (to be released soon)
- [#539](https://github.com/AbsaOSS/cobrix/issues/539) Fixed 'cp300', and added experimental support for 'cp1364' and 'cp1388' code pages (thanks [@BenceBenedek](https://github.com/BenceBenedek)).
- [#590](https://github.com/AbsaOSS/cobrix/issues/590) Changed from `.option("extended_metadata", true)` to `.option("metadata", "extended")` allowing other modes like 'basic' (default) and 'false' (disable metadata).
- [#593](https://github.com/AbsaOSS/cobrix/issues/593) Add option `.option("generate_record_bytes", true)` that adds a field containing raw bytes of each record decoded.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,24 @@ abstract class CodePage extends Serializable {

object CodePage extends Logging {

/**
 * Resolves one of the built-in EBCDIC code pages by its name.
 *
 * Code page names from: https://www.ibm.com/docs/en/zos-connect/zosconnect/3.0?topic=properties-coded-character-set-identifiers
 *
 * @param codePageName the name of the code page, e.g. "common", "cp037" or "cp1364"
 * @return a new instance of the code page registered under the given name
 * @throws IllegalArgumentException if the name is not one of the built-in code pages
 */
def getCodePageByName(codePageName: String): CodePage = {
  codePageName match {
    case "common"          => new CodePageCommon
    case "common_extended" => new CodePageCommonExt
    case "cp037"           => new CodePage037
    case "cp037_extended"  => new CodePage037Ext
    case "cp00300"         => new CodePage300 // This is the same as cp300
    case "cp300"           => new CodePage300
    case "cp838"           => new CodePage838
    case "cp870"           => new CodePage870
    case "cp875"           => new CodePage875
    case "cp1025"          => new CodePage1025
    case "cp1047"          => new CodePage1047
    case "cp1364"          => new CodePage1364
    case "cp1388"          => new CodePage1388
    // Any other name is rejected explicitly rather than falling back to a default code page.
    case codePage          => throw new IllegalArgumentException(s"The code page '$codePage' is not one of the builtin EBCDIC code pages.")
  }
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.cobrix.cobol.parser.encoding.codepage

import za.co.absa.cobrix.cobol.parser.encoding.codepage.TwoByteCodePage.createEbcdicToUnicodeTable

/**
 * Two-byte EBCDIC code page CCSID-1364 (Korean).
 */
class CodePage1364 extends TwoByteCodePage(CodePage1364.ebcdicToAsciiMapping) {
  override def codePageShortName: String = "cp1364"
}

object CodePage1364 {
  // The EBCDIC Code Page 1364 conversion tables were contributed by https://github.com/BenceBenedek
  // https://www.ibm.com/docs/en/i/7.3?topic=reference-ccsid-values
  //
  // Despite its name, the mapping is built by createEbcdicToUnicodeTable() from the Unicode table.
  val ebcdicToAsciiMapping: Array[Char] = {
    val ebcdicCodes = TwoByteTables1364.mappingTableEbcdic1364()
    val unicodeCodes = TwoByteTables1364.mappingTableUnicode1364()

    createEbcdicToUnicodeTable(ebcdicCodes, unicodeCodes)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.cobrix.cobol.parser.encoding.codepage

import za.co.absa.cobrix.cobol.parser.encoding.codepage.TwoByteCodePage.createEbcdicToUnicodeTable

/**
 * Two-byte EBCDIC code page CCSID-1388 (Simplified Chinese).
 */
class CodePage1388 extends TwoByteCodePage(CodePage1388.ebcdicToAsciiMapping) {
  override def codePageShortName: String = "cp1388"
}

object CodePage1388 {
  // The EBCDIC Code Page 1388 conversion tables were contributed by https://github.com/BenceBenedek
  // https://www.ibm.com/docs/en/i/7.3?topic=reference-ccsid-values
  //
  // Despite its name, the mapping is built by createEbcdicToUnicodeTable() from the Unicode table.
  val ebcdicToAsciiMapping: Array[Char] = {
    val ebcdicCodes = TwoByteTables1388.mappingTableEbcdic1388()
    val unicodeCodes = TwoByteTables1388.mappingTableUnicode1388()

    createEbcdicToUnicodeTable(ebcdicCodes, unicodeCodes)
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,37 +16,21 @@

package za.co.absa.cobrix.cobol.parser.encoding.codepage

import za.co.absa.cobrix.cobol.parser.encoding.codepage.TwoByteCodePage.createEbcdicToUnicodeTable

/**
 * EBCDIC code page 300 Japanese Latin Host Double-Byte.
 */
class CodePage300 extends TwoByteCodePage(CodePage300.ebcdicToAsciiMapping) {
  override def codePageShortName: String = "cp300"
}

object CodePage300 {
  /**
   * This is the EBCDIC Code Page CCSID-00300 contributed by https://github.com/BenceBenedek
   * https://public.dhe.ibm.com/software/globalization/gcoc/attachments/CP00300.pdf
   *
   * The lookup table is built by the shared TwoByteCodePage.createEbcdicToUnicodeTable() helper
   * instead of a local copy of the same loop.
   */
  val ebcdicToAsciiMapping: Array[Char] =
    createEbcdicToUnicodeTable(TwoByteTables300.mappingTableEbcdic300(), TwoByteTables300.mappingTableUnicode300())
}
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,19 @@ abstract class TwoByteCodePage(ebcdicToAsciiMapping: Array[Char]) extends CodePa
buf.toString
}
}

object TwoByteCodePage {
  /**
   * Builds a direct lookup table that maps a two-byte EBCDIC code to a character.
   *
   * The two input arrays are parallel: the element at position `i` of `unicodeTable` is the
   * code of the character for the EBCDIC code at position `i` of `ebcdicTable`. Codes that do
   * not occur in `ebcdicTable` keep the array's default value (the character with code 0).
   * If an EBCDIC code occurs more than once, the last occurrence wins.
   *
   * @param ebcdicTable  EBCDIC codes, used as indexes into the resulting table
   * @param unicodeTable character codes, converted with `toChar`, one per entry of `ebcdicTable`
   * @return an array of 65536 characters indexed by EBCDIC code
   */
  def createEbcdicToUnicodeTable(ebcdicTable: Array[Int], unicodeTable: Array[Int]): Array[Char] = {
    val lookup = new Array[Char](65536)

    // Walk the EBCDIC table by index so that the iteration count is driven by its length only.
    ebcdicTable.indices.foreach { idx =>
      val unicodeCode = unicodeTable(idx)
      val ebcdicCode = ebcdicTable(idx)
      lookup(ebcdicCode) = unicodeCode.toChar
    }

    lookup
  }
}

This file was deleted.

Loading

0 comments on commit 9492737

Please sign in to comment.