# Source: github.com/compromyse/enfold.git

import re
import fitz

# Ordered replacement table of (source, target) string pairs.
#
# convert_text() walks this list top to bottom and applies each pair with
# str.replace, so every entry operates on the output of all entries before it.
# ORDER MATTERS:
#   * longer patterns are listed before their own prefixes
#     (e.g. 'vks' / 'vkS' before 'vk' before 'v'), and
#   * a few entries only reorder or normalise source characters
#     (e.g. 'Q+Z' -> 'QZ+', 'ak' -> 'k' + anusvara) so that later entries
#     see them in the expected order.
# Targets are mostly Devanagari code points (U+09xx); the trailing comment on
# each line shows the pair as rendered glyphs.
# NOTE(review): the name suggests a Kruti Dev legacy-font -> Unicode mapping;
# confirm against the fonts used in the input PDFs.
k2u = [
   ('\xf1', '\u0970'),  #  ñ  ->  ॰
   ('Q+Z', 'QZ+'),  #  Q+Z  ->  QZ+
   ('sas', 'sa'),  #  sas  ->  sa
   ('aa', 'a'),  #  aa  ->  a
   (')Z', '\u0930\u094d\u0926\u094d\u0927'),  #  )Z  ->  र्द्ध
   ('ZZ', 'Z'),  #  ZZ  ->  Z
   ('\u2018', '"'),  #  ‘  ->  "
   ('\u2019', '"'),  #  ’  ->  "
   ('\u201c', u"'"),  #  “  ->  '
   ('\u201d', u"'"),  #  ”  ->  '
   ('\xe5', '\u0966'),  #  å  ->  ०
   ('\u0192', '\u0967'),  #  ƒ  ->  १
   ('\u201e', '\u0968'),  #  „  ->  २
   ('\u2026', '\u0969'),  #  …  ->  ३
   ('\u2020', '\u096a'),  #  †  ->  ४
   ('\u2021', '\u096b'),  #  ‡  ->  ५
   ('\u02c6', '\u096c'),  #  ˆ  ->  ६
   ('\u2030', '\u096d'),  #  ‰  ->  ७
   ('\u0160', '\u096e'),  #  Š  ->  ८
   ('\u2039', '\u096f'),  #  ‹  ->  ९
   ('\xb6+', '\u095e\u094d'),  #  ¶+  ->  फ़्
   ('d+', '\u0958'),  #  d+  ->  क़
   ('[+k', '\u0959'),  #  [+k  ->  ख़
   ('[+', '\u0959\u094d'),  #  [+  ->  ख़्
   ('x+', '\u095a'),  #  x+  ->  ग़
   ('T+', '\u091c\u093c\u094d'),  #  T+  ->  ज़्
   ('t+', '\u095b'),  #  t+  ->  ज़
   ('M+', '\u095c'),  #  M+  ->  ड़
   ('<+', '\u095d'),  #  <+  ->  ढ़
   ('Q+', '\u095e'),  #  Q+  ->  फ़
   (';+', '\u095f'),  #  ;+  ->  य़
   ('j+', '\u0931'),  #  j+  ->  ऱ
   ('u+', '\u0929'),  #  u+  ->  ऩ
   ('\xd9k', '\u0924\u094d\u0924'),  #  Ùk  ->  त्त
   ('\xd9', '\u0924\u094d\u0924\u094d'),  #  Ù  ->  त्त्
   ('\xe4', '\u0915\u094d\u0924'),  #  ä  ->  क्त
   ('\u2013', '\u0926\u0943'),  #  –  ->  दृ
   ('\u2014', '\u0915\u0943'),  #  —  ->  कृ
   ('\xe9', '\u0928\u094d\u0928'),  #  é  ->  न्न
   ('\u2122', '\u0928\u094d\u0928\u094d'),  #  ™  ->  न्न्
   ('=kk', '=k'),  #  =kk  ->  =k
   ('f=k', 'f='),  #  f=k  ->  f=
   ('\xe0', '\u0939\u094d\u0928'),  #  à  ->  ह्न
   ('\xe1', '\u0939\u094d\u092f'),  #  á  ->  ह्य
   ('\xe2', '\u0939\u0943'),  #  â  ->  हृ
   ('\xe3', '\u0939\u094d\u092e'),  #  ã  ->  ह्म
   ('\xbaz', '\u0939\u094d\u0930'),  #  ºz  ->  ह्र
   ('\xba', '\u0939\u094d'),  #  º  ->  ह्
   ('\xed', '\u0926\u094d\u0926'),  #  í  ->  द्द
   ('{k', '\u0915\u094d\u0937'),  #  {k  ->  क्ष
   ('{', '\u0915\u094d\u0937\u094d'),  #  {  ->  क्ष्
   ('=', '\u0924\u094d\u0930'),  #  =  ->  त्र
   ('\xab', '\u0924\u094d\u0930\u094d'),  #  «  ->  त्र्
   ('N\xee', '\u091b\u094d\u092f'),  #  Nî  ->  छ्य
   ('V\xee', '\u091f\u094d\u092f'),  #  Vî  ->  ट्य
   ('B\xee', '\u0920\u094d\u092f'),  #  Bî  ->  ठ्य
   ('M\xee', '\u0921\u094d\u092f'),  #  Mî  ->  ड्य
   ('<\xee', '\u0922\u094d\u092f'),  #  <î  ->  ढ्य
   ('|', '\u0926\u094d\u092f'),  #  |  ->  द्य
   ('K', '\u091c\u094d\u091e'),  #  K  ->  ज्ञ
   ('}', '\u0926\u094d\u0935'),  #  }  ->  द्व
   ('J', '\u0936\u094d\u0930'),  #  J  ->  श्र
   ('V\xaa', '\u091f\u094d\u0930'),  #  Vª  ->  ट्र
   ('M\xaa', '\u0921\u094d\u0930'),  #  Mª  ->  ड्र
   ('<\xaa\xaa', '\u0922\u094d\u0930'),  #  <ªª  ->  ढ्र
   ('N\xaa', '\u091b\u094d\u0930'),  #  Nª  ->  छ्र
   ('\xd8', '\u0915\u094d\u0930'),  #  Ø  ->  क्र
   ('\xdd', '\u092b\u094d\u0930'),  #  Ý  ->  फ्र
   ('nzZ', '\u0930\u094d\u0926\u094d\u0930'),  #  nzZ  ->  र्द्र
   ('\xe6', '\u0926\u094d\u0930'),  #  æ  ->  द्र
   ('\xe7', '\u092a\u094d\u0930'),  #  ç  ->  प्र
   ('\xc1', '\u092a\u094d\u0930'),  #  Á  ->  प्र
   ('xz', '\u0917\u094d\u0930'),  #  xz  ->  ग्र
   ('#', '\u0930\u0941'),  #  #  ->  रु
   (':', '\u0930\u0942'),  #  :  ->  रू
   # Independent vowels: longest source sequence first.
   ('v\u201a', '\u0911'),  #  v‚  ->  ऑ
   ('vks', '\u0913'),  #  vks  ->  ओ
   ('vkS', '\u0914'),  #  vkS  ->  औ
   ('vk', '\u0906'),  #  vk  ->  आ
   ('v', '\u0905'),  #  v  ->  अ
   ('b\xb1', '\u0908\u0902'),  #  b±  ->  ईं
   ('\xc3', '\u0908'),  #  Ã  ->  ई
   ('bZ', '\u0908'),  #  bZ  ->  ई
   ('b', '\u0907'),  #  b  ->  इ
   ('m', '\u0909'),  #  m  ->  उ
   ('\xc5', '\u090a'),  #  Å  ->  ऊ
   (',s', '\u0910'),  #  ,s  ->  ऐ
   (',', '\u090f'),  #  ,  ->  ए
   ('_', '\u090b'),  #  _  ->  ऋ
   # Consonants: '<half form>k' yields the full consonant, the bare half
   # form yields consonant + virama (U+094D).
   ('\xf4', '\u0915\u094d\u0915'),  #  ô  ->  क्क
   ('d', '\u0915'),  #  d  ->  क
   ('Dk', '\u0915'),  #  Dk  ->  क
   ('D', '\u0915\u094d'),  #  D  ->  क्
   ('[k', '\u0916'),  #  [k  ->  ख
   ('[', '\u0916\u094d'),  #  [  ->  ख्
   ('x', '\u0917'),  #  x  ->  ग
   ('Xk', '\u0917'),  #  Xk  ->  ग
   ('X', '\u0917\u094d'),  #  X  ->  ग्
   ('\xc4', '\u0918'),  #  Ä  ->  घ
   ('?k', '\u0918'),  #  ?k  ->  घ
   ('?', '\u0918\u094d'),  #  ?  ->  घ्
   ('\xb3', '\u0919'),  #  ³  ->  ङ
   ('pkS', '\u091a\u0948'),  #  pkS  ->  चै
   ('p', '\u091a'),  #  p  ->  च
   ('Pk', '\u091a'),  #  Pk  ->  च
   ('P', '\u091a\u094d'),  #  P  ->  च्
   ('N', '\u091b'),  #  N  ->  छ
   ('t', '\u091c'),  #  t  ->  ज
   ('Tk', '\u091c'),  #  Tk  ->  ज
   ('T', '\u091c\u094d'),  #  T  ->  ज्
   ('>', '\u091d'),  #  >  ->  झ
   ('\xf7', '\u091d\u094d'),  #  ÷  ->  झ्
   ('\xa5', '\u091e'),  #  ¥  ->  ञ
   ('\xea', '\u091f\u094d\u091f'),  #  ê  ->  ट्ट
   ('\xeb', '\u091f\u094d\u0920'),  #  ë  ->  ट्ठ
   ('V', '\u091f'),  #  V  ->  ट
   ('B', '\u0920'),  #  B  ->  ठ
   ('\xec', '\u0921\u094d\u0921'),  #  ì  ->  ड्ड
   ('\xef', '\u0921\u094d\u0922'),  #  ï  ->  ड्ढ
   # NOTE(review): 'M+' and '<+' were already replaced by earlier entries in
   # this table, so no occurrences should remain for the next two entries.
   ('M+', '\u0921\u093c'),  #  M+  ->  ड़
   ('<+', '\u0922\u093c'),  #  <+  ->  ढ़
   ('M', '\u0921'),  #  M  ->  ड
   ('<', '\u0922'),  #  <  ->  ढ
   ('.k', '\u0923'),  #  .k  ->  ण
   ('.', '\u0923\u094d'),  #  .  ->  ण्
   ('r', '\u0924'),  #  r  ->  त
   ('Rk', '\u0924'),  #  Rk  ->  त
   ('R', '\u0924\u094d'),  #  R  ->  त्
   ('Fk', '\u0925'),  #  Fk  ->  थ
   ('F', '\u0925\u094d'),  #  F  ->  थ्
   (')', '\u0926\u094d\u0927'),  #  )  ->  द्ध
   ('n', '\u0926'),  #  n  ->  द
   ('/k', '\u0927'),  #  /k  ->  ध
#   ('\xe8k', '\u0927'),  #  èk  ->  ध   (disabled entry)
   ('/', '\u0927\u094d'),  #  /  ->  ध्
   ('\xcb', '\u0927\u094d'),  #  Ë  ->  ध्
#   ('\xe8', '\u0927\u094d'),  #  è  ->  ध्   (disabled; superseded by the entry below)
   ('\xe8', '\u0927'),  #  è  ->  ध
   ('u', '\u0928'),  #  u  ->  न
   ('Uk', '\u0928'),  #  Uk  ->  न
   ('U', '\u0928\u094d'),  #  U  ->  न्
   ('i', '\u092a'),  #  i  ->  प
   ('Ik', '\u092a'),  #  Ik  ->  प
   ('I', '\u092a\u094d'),  #  I  ->  प्
   ('Q', '\u092b'),  #  Q  ->  फ
   ('\xb6', '\u092b\u094d'),  #  ¶  ->  फ्
   ('c', '\u092c'),  #  c  ->  ब
   ('Ck', '\u092c'),  #  Ck  ->  ब
   ('C', '\u092c\u094d'),  #  C  ->  ब्
   ('Hk', '\u092d'),  #  Hk  ->  भ
   ('H', '\u092d\u094d'),  #  H  ->  भ्
   ('e', '\u092e'),  #  e  ->  म
   ('Ek', '\u092e'),  #  Ek  ->  म
   ('E', '\u092e\u094d'),  #  E  ->  म्
   (';', '\u092f'),  #  ;  ->  य
   ('\xb8', '\u092f\u094d'),  #  ¸  ->  य्
   ('j', '\u0930'),  #  j  ->  र
   ('y', '\u0932'),  #  y  ->  ल
   ('Yk', '\u0932'),  #  Yk  ->  ल
   ('Y', '\u0932\u094d'),  #  Y  ->  ल्
   ('G', '\u0933'),  #  G  ->  ळ
   ('o', '\u0935'),  #  o  ->  व
   ('Ok', '\u0935'),  #  Ok  ->  व
   ('O', '\u0935\u094d'),  #  O  ->  व्
   (u"'k", '\u0936'),  #  'k  ->  श
   (u"'", '\u0936\u094d'),  #  '  ->  श्
   ('"k', '\u0937'),  #  "k  ->  ष
   ('"', '\u0937\u094d'),  #  "  ->  ष्
   ('l', '\u0938'),  #  l  ->  स
   ('Lk', '\u0938'),  #  Lk  ->  स
   ('L', '\u0938\u094d'),  #  L  ->  स्
   ('g', '\u0939'),  #  g  ->  ह
   ('\xc8', '\u0940\u0902'),  #  È  ->  ीं
   ('saz', '\u094d\u0930\u0947\u0902'),  #  saz  ->  ्रें
   ('z', '\u094d\u0930'),  #  z  ->  ्र
   ('\xcc', '\u0926\u094d\u0926'),  #  Ì  ->  द्द
   ('\xcd', '\u091f\u094d\u091f'),  #  Í  ->  ट्ट
   ('\xce', '\u091f\u094d\u0920'),  #  Î  ->  ट्ठ
   ('\xcf', '\u0921\u094d\u0921'),  #  Ï  ->  ड्ड
   ('\xd1', '\u0915\u0943'),  #  Ñ  ->  कृ
   ('\xd2', '\u092d'),  #  Ò  ->  भ
   ('\xd3', '\u094d\u092f'),  #  Ó  ->  ्य
   ('\xd4', '\u0921\u094d\u0922'),  #  Ô  ->  ड्ढ
   ('\xd6', '\u091d\u094d'),  #  Ö  ->  झ्
   # NOTE(review): '\xd8' and '\xd9' repeat keys that appear earlier in this
   # table with the same targets; by this point none should remain.
   ('\xd8', '\u0915\u094d\u0930'),  #  Ø  ->  क्र
   ('\xd9', '\u0924\u094d\u0924\u094d'),  #  Ù  ->  त्त्
   ('\xdck', '\u0936'),  #  Ük  ->  श
   ('\xdc', '\u0936\u094d'),  #  Ü  ->  श्
   # Dependent vowel signs (matras) and nasalisation marks.
   ('\u201a', '\u0949'),  #  ‚  ->  ॉ
   ('kas', '\u094b\u0902'),  #  kas  ->  ों
   ('ks', '\u094b'),  #  ks  ->  ो
   ('kS', '\u094c'),  #  kS  ->  ौ
   ('\xa1k', '\u093e\u0901'),  #  ¡k  ->  ाँ
   # Moves the anusvara after 'k' so the next entry turns 'k' into the matra.
   ('ak', 'k\u0902'),  #  ak  ->  k +  ं
   ('k', '\u093e'),  #  k  ->  ा
   ('ah', '\u0940\u0902'),  #  ah  ->  ीं
   ('h', '\u0940'),  #  h  ->  ी
   ('aq', '\u0941\u0902'),  #  aq  ->   ुं
   ('q', '\u0941'),  #  q  ->  ु
   ('aw', '\u0942\u0902'),  #  aw  ->  ूं
   ('\xa1w', '\u0942\u0901'),  #  ¡w  ->  ूँ
   ('w', '\u0942'),  #  w  ->  ू
   ('`', '\u0943'),  #  `  ->  ृ
   ('\u0300', '\u0943'),  #  U+0300 (combining grave accent)  ->  ृ
   ('as', '\u0947\u0902'),  #  as  ->  ें
   ('\xb1s', 's\xb1'), #  ±s  ->  s±
   ('s', '\u0947'),  #  s  ->  े
   ('aS', '\u0948\u0902'),  #  aS  ->  ैं
   ('S', '\u0948'),  #  S  ->  ै
   ('a\xaa', '\u094d\u0930\u0902'), #  aª  ->  ्र + ं
   ('\xaa', '\u094d\u0930'), #  ª  ->  ्र
   # Swaps 'fa' so the anusvara precedes 'f'; 'f' itself is not converted by
   # any active entry in this table.
   ('fa', '\u0902f'),  #  fa  ->  ं  + f
   ('a', '\u0902'),  #  a  ->  ं
   ('\xa1', '\u0901'),  #  ¡  ->  ँ
   ('%', ':'),  #  %  ->  :
   ('W', '\u0945'),  #  W  ->  ॅ
   ('\u2022', '\u093d'),  #  •  ->  ऽ
   ('\xb7', '\u093d'),  #  ·  ->  ऽ
   ('\u2219', '\u093d'),  #  ∙  ->  ऽ
   # NOTE(review): exact duplicate of the '\xb7' entry two lines above.
   ('\xb7', '\u093d'),  #  ·  ->  ऽ
   ('~j', '\u094d\u0930'),  #  ~j  ->  ्र
   ('~', '\u094d'),  #  ~  ->  ्
   # Punctuation: source characters that stand for ordinary punctuation.
   ('\\', '?'),  #  \  ->  ?
   ('+', '\u093c'),  #  +  ->  ़
   ('^', '\u2018'),  #  ^  ->  ‘
   ('*', '\u2019'),  #  *  ->  ’
   ('\xde', '\u201c'),  #  Þ  ->  “
   ('\xdf', '\u201d'),  #  ß  ->  ”
   ('(', ';'),  #  (  ->  ;
   ('\xbc', '('),  #  ¼  ->  (
   ('\xbd', ')'),  #  ½  ->  )
   ('\xbf', '{'),  #  ¿  ->  {
   ('\xc0', '}'),  #  À  ->  }
   ('\xbe', '='),  #  ¾  ->  =
   ('A', '\u0964'),  #  A  ->  ।
   ('-', '.'),  #  -  ->  .
   ('&', '-'),  #  &  ->  -
   # NOTE(review): every '&' was consumed by the entry above, so this entry
   # cannot match.
   ('&', '\xb5'),  #  &  ->  µ
   ('\u03bc', '-'),  #  μ  ->  -
   ('\u0152', '\u0970'),  #  Œ  ->  ॰
   (']', ','),  #  ]  ->  ,
   # NOTE(review): '~' was already replaced by an earlier entry, so '~ '
   # should no longer occur here.
   ('~ ', '\u094d '),  #  ~  ->  ् 
   ('@', '/'),  #  @  ->  /
   ('\xae', '\u0948\u0902'), #  ®  ->  ैं
# Disabled entries, kept for reference.
# NOTE(review): presumably 'f' and 'Z' need position-dependent reordering that
# a plain str.replace cannot express — confirm before re-enabling.
#   ('%', '\u0903'),  #  %  ->  ः
#   (' \u0903', ':'),  #   ः  ->  :
#   ('\xc7', '\u093f\u0902'), #  Ç  ->  िं
#   ('\xca', '\u0940Z'), #  Ê  ->  ीZ
#   ('Z', '\u0930\u094d'), #  Z  ->  र्
#   ('f', '\u093f'), #  f  ->  ि
#   ('\xb1', 'Z\u0902'), #  ±  ->  Zं
#   ('\xc6', '\u0930\u094d\u093f'), #  Æ  ->  र्ि
#   ('\xc9', '\u0930\u094d\u093f\u0902'),  #  É  ->  र्िं
]

def convert_text(text):
    """Apply every ``k2u`` replacement to *text*, in table order.

    Each ``(source, target)`` pair is applied with ``str.replace`` to the
    result of the previous pair, so the order of ``k2u`` determines the
    outcome.

    Args:
        text: The string to convert.

    Returns:
        The string after all replacements have been applied.
    """
    converted = text
    for pair in k2u:
        legacy_seq, unicode_seq = pair
        converted = converted.replace(legacy_seq, unicode_seq)
    return converted

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The ``page.get_text()`` output of each page, joined in page order
        with no separator added between pages.
    """
    # The context manager closes the document even when extraction of a page
    # raises; previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        # join assembles the result in one pass instead of repeated +=.
        return "".join(page.get_text() for page in doc)

def convert_pdf(pdf_path, output_path):
    """Extract a PDF's text, convert it, and write it to a UTF-8 file.

    Args:
        pdf_path: Path of the PDF to read (via ``extract_text_from_pdf``).
        output_path: Path of the text file to create or overwrite.
    """
    # Extraction and conversion finish before the output file is opened.
    unicode_text = convert_text(extract_text_from_pdf(pdf_path))
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(unicode_text)

# Command-line entry point: <script> input.pdf output.txt
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        # Passing a string to sys.exit prints it to stderr and exits with
        # status 1, so callers and shell scripts can detect the usage error
        # (previously the message went to stdout and the exit status was 0).
        sys.exit(f"Usage: python {sys.argv[0]} input.pdf output.txt")
    convert_pdf(sys.argv[1], sys.argv[2])