"""Convert legacy-font-encoded Hindi text extracted from PDFs to Unicode.

The ``k2u`` table maps glyph sequences of a legacy 8-bit Hindi font encoding
(the layout looks like Kruti Dev -- TODO confirm) to Unicode Devanagari.
``convert_text`` applies the table, and ``convert_pdf`` runs it over the text
layer of a PDF and writes the result as UTF-8.

Usage: python translate.py input.pdf output.txt
"""

import re

# Ordered replacement table: (legacy glyph sequence, Unicode replacement).
#
# ORDER MATTERS.  ``convert_text`` applies the entries one after another over
# the whole text, so:
#   * longer sequences must precede their own prefixes ('vks' before 'vk'
#     before 'v'), and
#   * an entry's output may deliberately be consumed by a later entry
#     ('Q+Z' -> 'QZ+' is later picked up by 'Q+').
# Do not sort, deduplicate by key, or turn this into a dict/translate table.
k2u = [
    ('\xf1', '\u0970'),  # ñ -> ॰
    # Glyph-order normalisation (output is re-processed by later entries).
    ('Q+Z', 'QZ+'),
    ('sas', 'sa'),
    ('aa', 'a'),
    (')Z', '\u0930\u094d\u0926\u094d\u0927'),  # )Z -> र्द्ध
    ('ZZ', 'Z'),
    ('\u2018', '"'),  # ‘ -> "
    ('\u2019', '"'),  # ’ -> "
    ('\u201c', "'"),  # “ -> '
    ('\u201d', "'"),  # ” -> '
    # Devanagari digits.
    ('\xe5', '\u0966'),  # å -> ०
    ('\u0192', '\u0967'),  # ƒ -> १
    ('\u201e', '\u0968'),  # „ -> २
    ('\u2026', '\u0969'),  # … -> ३
    ('\u2020', '\u096a'),  # † -> ४
    ('\u2021', '\u096b'),  # ‡ -> ५
    ('\u02c6', '\u096c'),  # ˆ -> ६
    ('\u2030', '\u096d'),  # ‰ -> ७
    ('\u0160', '\u096e'),  # Š -> ८
    ('\u2039', '\u096f'),  # ‹ -> ९
    # Nukta forms ('+' is the nukta glyph).
    ('\xb6+', '\u095e\u094d'),  # ¶+ -> फ़्
    ('d+', '\u0958'),  # d+ -> क़
    ('[+k', '\u0959'),  # [+k -> ख़
    ('[+', '\u0959\u094d'),  # [+ -> ख़्
    ('x+', '\u095a'),  # x+ -> ग़
    ('T+', '\u091c\u093c\u094d'),  # T+ -> ज़्
    ('t+', '\u095b'),  # t+ -> ज़
    ('M+', '\u095c'),  # M+ -> ड़
    ('<+', '\u095d'),  # <+ -> ढ़
    ('Q+', '\u095e'),  # Q+ -> फ़
    (';+', '\u095f'),  # ;+ -> य़
    ('j+', '\u0931'),  # j+ -> ऱ
    ('u+', '\u0929'),  # u+ -> ऩ
    # Conjuncts and special ligatures.
    ('\xd9k', '\u0924\u094d\u0924'),  # Ùk -> त्त
    ('\xd9', '\u0924\u094d\u0924\u094d'),  # Ù -> त्त्
    ('\xe4', '\u0915\u094d\u0924'),  # ä -> क्त
    ('\u2013', '\u0926\u0943'),  # – -> दृ
    ('\u2014', '\u0915\u0943'),  # — -> कृ
    ('\xe9', '\u0928\u094d\u0928'),  # é -> न्न
    ('\u2122', '\u0928\u094d\u0928\u094d'),  # ™ -> न्न्
    ('=kk', '=k'),
    ('f=k', 'f='),
    ('\xe0', '\u0939\u094d\u0928'),  # à -> ह्न
    ('\xe1', '\u0939\u094d\u092f'),  # á -> ह्य
    ('\xe2', '\u0939\u0943'),  # â -> हृ
    ('\xe3', '\u0939\u094d\u092e'),  # ã -> ह्म
    ('\xbaz', '\u0939\u094d\u0930'),  # ºz -> ह्र
    ('\xba', '\u0939\u094d'),  # º -> ह्
    ('\xed', '\u0926\u094d\u0926'),  # í -> द्द
    ('{k', '\u0915\u094d\u0937'),  # {k -> क्ष
    ('{', '\u0915\u094d\u0937\u094d'),  # { -> क्ष्
    ('=', '\u0924\u094d\u0930'),  # = -> त्र
    ('\xab', '\u0924\u094d\u0930\u094d'),  # « -> त्र्
    ('N\xee', '\u091b\u094d\u092f'),  # Nî -> छ्य
    ('V\xee', '\u091f\u094d\u092f'),  # Vî -> ट्य
    ('B\xee', '\u0920\u094d\u092f'),  # Bî -> ठ्य
    ('M\xee', '\u0921\u094d\u092f'),  # Mî -> ड्य
    ('<\xee', '\u0922\u094d\u092f'),  # <î -> ढ्य
    ('|', '\u0926\u094d\u092f'),  # | -> द्य
    ('K', '\u091c\u094d\u091e'),  # K -> ज्ञ
    ('}', '\u0926\u094d\u0935'),  # } -> द्व
    ('J', '\u0936\u094d\u0930'),  # J -> श्र
    ('V\xaa', '\u091f\u094d\u0930'),  # Vª -> ट्र
    ('M\xaa', '\u0921\u094d\u0930'),  # Mª -> ड्र
    ('<\xaa\xaa', '\u0922\u094d\u0930'),  # <ªª -> ढ्र
    ('N\xaa', '\u091b\u094d\u0930'),  # Nª -> छ्र
    ('\xd8', '\u0915\u094d\u0930'),  # Ø -> क्र
    ('\xdd', '\u092b\u094d\u0930'),  # Ý -> फ्र
    ('nzZ', '\u0930\u094d\u0926\u094d\u0930'),  # nzZ -> र्द्र
    ('\xe6', '\u0926\u094d\u0930'),  # æ -> द्र
    ('\xe7', '\u092a\u094d\u0930'),  # ç -> प्र
    ('\xc1', '\u092a\u094d\u0930'),  # Á -> प्र
    ('xz', '\u0917\u094d\u0930'),  # xz -> ग्र
    ('#', '\u0930\u0941'),  # # -> रु
    (':', '\u0930\u0942'),  # : -> रू
    # Independent vowels.
    ('v\u201a', '\u0911'),  # v‚ -> ऑ
    ('vks', '\u0913'),  # vks -> ओ
    ('vkS', '\u0914'),  # vkS -> औ
    ('vk', '\u0906'),  # vk -> आ
    ('v', '\u0905'),  # v -> अ
    ('b\xb1', '\u0908\u0902'),  # b± -> ईं
    ('\xc3', '\u0908'),  # Ã -> ई
    ('bZ', '\u0908'),  # bZ -> ई
    ('b', '\u0907'),  # b -> इ
    ('m', '\u0909'),  # m -> उ
    ('\xc5', '\u090a'),  # Å -> ऊ
    (',s', '\u0910'),  # ,s -> ऐ
    (',', '\u090f'),  # , -> ए
    ('_', '\u090b'),  # _ -> ऋ
    # Consonants (full form, then half form with virama).
    ('\xf4', '\u0915\u094d\u0915'),  # ô -> क्क
    ('d', '\u0915'),  # d -> क
    ('Dk', '\u0915'),  # Dk -> क
    ('D', '\u0915\u094d'),  # D -> क्
    ('[k', '\u0916'),  # [k -> ख
    ('[', '\u0916\u094d'),  # [ -> ख्
    ('x', '\u0917'),  # x -> ग
    ('Xk', '\u0917'),  # Xk -> ग
    ('X', '\u0917\u094d'),  # X -> ग्
    ('\xc4', '\u0918'),  # Ä -> घ
    ('?k', '\u0918'),  # ?k -> घ
    ('?', '\u0918\u094d'),  # ? -> घ्
    ('\xb3', '\u0919'),  # ³ -> ङ
    # NOTE(review): 'kS' on its own maps to ौ below, so this entry produces
    # चै where चौ might be expected -- confirm against real input.
    ('pkS', '\u091a\u0948'),  # pkS -> चै
    ('p', '\u091a'),  # p -> च
    ('Pk', '\u091a'),  # Pk -> च
    ('P', '\u091a\u094d'),  # P -> च्
    ('N', '\u091b'),  # N -> छ
    ('t', '\u091c'),  # t -> ज
    ('Tk', '\u091c'),  # Tk -> ज
    ('T', '\u091c\u094d'),  # T -> ज्
    ('>', '\u091d'),  # > -> झ
    ('\xf7', '\u091d\u094d'),  # ÷ -> झ्
    ('\xa5', '\u091e'),  # ¥ -> ञ
    ('\xea', '\u091f\u094d\u091f'),  # ê -> ट्ट
    ('\xeb', '\u091f\u094d\u0920'),  # ë -> ट्ठ
    ('V', '\u091f'),  # V -> ट
    ('B', '\u0920'),  # B -> ठ
    ('\xec', '\u0921\u094d\u0921'),  # ì -> ड्ड
    ('\xef', '\u0921\u094d\u0922'),  # ï -> ड्ढ
    ('M', '\u0921'),  # M -> ड
    ('<', '\u0922'),  # < -> ढ
    ('.k', '\u0923'),  # .k -> ण
    ('.', '\u0923\u094d'),  # . -> ण्
    ('r', '\u0924'),  # r -> त
    ('Rk', '\u0924'),  # Rk -> त
    ('R', '\u0924\u094d'),  # R -> त्
    ('Fk', '\u0925'),  # Fk -> थ
    ('F', '\u0925\u094d'),  # F -> थ्
    (')', '\u0926\u094d\u0927'),  # ) -> द्ध
    ('n', '\u0926'),  # n -> द
    ('/k', '\u0927'),  # /k -> ध
    ('/', '\u0927\u094d'),  # / -> ध्
    ('\xcb', '\u0927\u094d'),  # Ë -> ध्
    # An alternative pair ('èk' -> ध, 'è' -> ध्) was disabled in the original
    # table in favour of the single entry below.
    ('\xe8', '\u0927'),  # è -> ध
    ('u', '\u0928'),  # u -> न
    ('Uk', '\u0928'),  # Uk -> न
    ('U', '\u0928\u094d'),  # U -> न्
    ('i', '\u092a'),  # i -> प
    ('Ik', '\u092a'),  # Ik -> प
    ('I', '\u092a\u094d'),  # I -> प्
    ('Q', '\u092b'),  # Q -> फ
    ('\xb6', '\u092b\u094d'),  # ¶ -> फ्
    ('c', '\u092c'),  # c -> ब
    ('Ck', '\u092c'),  # Ck -> ब
    ('C', '\u092c\u094d'),  # C -> ब्
    ('Hk', '\u092d'),  # Hk -> भ
    ('H', '\u092d\u094d'),  # H -> भ्
    ('e', '\u092e'),  # e -> म
    ('Ek', '\u092e'),  # Ek -> म
    ('E', '\u092e\u094d'),  # E -> म्
    (';', '\u092f'),  # ; -> य
    ('\xb8', '\u092f\u094d'),  # ¸ -> य्
    ('j', '\u0930'),  # j -> र
    ('y', '\u0932'),  # y -> ल
    ('Yk', '\u0932'),  # Yk -> ल
    ('Y', '\u0932\u094d'),  # Y -> ल्
    ('G', '\u0933'),  # G -> ळ
    ('o', '\u0935'),  # o -> व
    ('Ok', '\u0935'),  # Ok -> व
    ('O', '\u0935\u094d'),  # O -> व्
    ("'k", '\u0936'),  # 'k -> श
    ("'", '\u0936\u094d'),  # ' -> श्
    ('"k', '\u0937'),  # "k -> ष
    ('"', '\u0937\u094d'),  # " -> ष्
    ('l', '\u0938'),  # l -> स
    ('Lk', '\u0938'),  # Lk -> स
    ('L', '\u0938\u094d'),  # L -> स्
    ('g', '\u0939'),  # g -> ह
    ('\xc8', '\u0940\u0902'),  # È -> ीं
    ('saz', '\u094d\u0930\u0947\u0902'),  # saz -> ्रें
    ('z', '\u094d\u0930'),  # z -> ्र
    ('\xcc', '\u0926\u094d\u0926'),  # Ì -> द्द
    ('\xcd', '\u091f\u094d\u091f'),  # Í -> ट्ट
    ('\xce', '\u091f\u094d\u0920'),  # Î -> ट्ठ
    ('\xcf', '\u0921\u094d\u0921'),  # Ï -> ड्ड
    ('\xd1', '\u0915\u0943'),  # Ñ -> कृ
    ('\xd2', '\u092d'),  # Ò -> भ
    ('\xd3', '\u094d\u092f'),  # Ó -> ्य
    ('\xd4', '\u0921\u094d\u0922'),  # Ô -> ड्ढ
    ('\xd6', '\u091d\u094d'),  # Ö -> झ्
    ('\xdck', '\u0936'),  # Ük -> श
    ('\xdc', '\u0936\u094d'),  # Ü -> श्
    # Dependent vowel signs (matras), anusvara and chandrabindu.
    ('\u201a', '\u0949'),  # ‚ -> ॉ
    ('kas', '\u094b\u0902'),  # kas -> ों
    ('ks', '\u094b'),  # ks -> ो
    ('kS', '\u094c'),  # kS -> ौ
    ('\xa1k', '\u093e\u0901'),  # ¡k -> ाँ
    ('ak', 'k\u0902'),  # ak -> k + ं (the 'k' is converted by the next entry)
    ('k', '\u093e'),  # k -> ा
    ('ah', '\u0940\u0902'),  # ah -> ीं
    ('h', '\u0940'),  # h -> ी
    ('aq', '\u0941\u0902'),  # aq -> ुं
    ('q', '\u0941'),  # q -> ु
    ('aw', '\u0942\u0902'),  # aw -> ूं
    ('\xa1w', '\u0942\u0901'),  # ¡w -> ूँ
    ('w', '\u0942'),  # w -> ू
    ('`', '\u0943'),  # ` -> ृ
    ('\u0300', '\u0943'),  # U+0300 (combining grave accent) -> ृ
    ('as', '\u0947\u0902'),  # as -> ें
    ('\xb1s', 's\xb1'),  # ±s -> s± (reorder; 's' is converted by the next entry)
    ('s', '\u0947'),  # s -> े
    ('aS', '\u0948\u0902'),  # aS -> ैं
    ('S', '\u0948'),  # S -> ै
    ('a\xaa', '\u094d\u0930\u0902'),  # aª -> ्रं
    ('\xaa', '\u094d\u0930'),  # ª -> ्र
    ('fa', '\u0902f'),  # fa -> ं + f
    ('a', '\u0902'),  # a -> ं
    ('\xa1', '\u0901'),  # ¡ -> ँ
    ('%', ':'),  # % -> :
    ('W', '\u0945'),  # W -> ॅ
    ('\u2022', '\u093d'),  # • -> ऽ
    ('\xb7', '\u093d'),  # · -> ऽ
    ('\u2219', '\u093d'),  # ∙ -> ऽ
    ('~j', '\u094d\u0930'),  # ~j -> ्र
    ('~', '\u094d'),  # ~ -> ्
    # Punctuation.
    ('\\', '?'),  # backslash -> ?
    ('+', '\u093c'),  # + -> ़
    ('^', '\u2018'),  # ^ -> ‘
    ('*', '\u2019'),  # * -> ’
    ('\xde', '\u201c'),  # Þ -> “
    ('\xdf', '\u201d'),  # ß -> ”
    ('(', ';'),  # ( -> ;
    ('\xbc', '('),  # ¼ -> (
    ('\xbd', ')'),  # ½ -> )
    ('\xbf', '{'),  # ¿ -> {
    ('\xc0', '}'),  # À -> }
    ('\xbe', '='),  # ¾ -> =
    ('A', '\u0964'),  # A -> ।
    ('-', '.'),  # - -> .
    ('&', '-'),  # & -> -
    ('\u03bc', '-'),  # μ -> -
    ('\u0152', '\u0970'),  # Œ -> ॰
    (']', ','),  # ] -> ,
    ('@', '/'),  # @ -> /
    ('\xae', '\u0948\u0902'),  # ® -> ैं
]
# NOTE: The following glyphs have no standalone entry, so any occurrence that
# is not consumed by a multi-character entry above passes through unchanged:
#     'f' (ि), 'Z' (र्), '±' (Zं), 'Æ' (र्ि), 'Ç' (िं), 'É' (र्िं), 'Ê' (ीZ)
# The mappings in parentheses were present but disabled in the original
# table -- presumably because ि and the reph have to be moved relative to the
# neighbouring consonant, which a plain replace cannot do (TODO confirm and
# implement).  Also disabled there: '%' -> ः together with ' ः' -> ':'.


def convert_text(text: str) -> str:
    """Return *text* with every ``k2u`` replacement applied in table order.

    Each entry is applied to the whole string before the next one is
    considered, so later entries see the output of earlier ones.
    """
    for src, tgt in k2u:
        text = text.replace(src, tgt)
    return text


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    The document is closed before returning, also when extraction raises.
    """
    # PyMuPDF is imported lazily so that convert_text() stays usable on
    # machines where the PDF library is not installed.
    import fitz

    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def convert_pdf(pdf_path, output_path):
    """Extract the text of *pdf_path*, convert it and write it as UTF-8.

    *output_path* is created or overwritten.
    """
    converted = convert_text(extract_text_from_pdf(pdf_path))
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(converted)


if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        # sys.exit(str) writes the message to stderr and exits with status 1,
        # so callers and shell scripts can detect the usage error.
        sys.exit(f"Usage: python {sys.argv[0]} input.pdf output.txt")
    convert_pdf(sys.argv[1], sys.argv[2])