lightproof_impl_en.py

# -*- encoding: UTF-8 -*-
import uno, re, sys, os, traceback
from com.sun.star.text.TextMarkupType import PROOFREADING
from com.sun.star.beans import PropertyValue

pkg = "en"
lang = "en"
locales = {'en-GB': ['en', 'GB', ''], 'en-ZW': ['en', 'ZW', ''], 'en-PH': ['en', 'PH', ''], 'en-TT': ['en', 'TT', ''], 'en-BZ': ['en', 'BZ', ''], 'en-NA': ['en', 'NA', ''], 'en-IE': ['en', 'IE', ''], 'en-GH': ['en', 'GH', ''], 'en-US': ['en', 'US', ''], 'en-IN': ['en', 'IN', ''], 'en-BS': ['en', 'BS', ''], 'en-JM': ['en', 'JM', ''], 'en-AU': ['en', 'AU', ''], 'en-NZ': ['en', 'NZ', ''], 'en-ZA': ['en', 'ZA', ''], 'en-CA': ['en', 'CA', '']}
version = "0.4.3"
author = "László Németh"
name = "Lightproof grammar checker (English)"

import lightproof_handler_en

# loaded rules (check for Update mechanism of the editor)
try:
    langrule
except NameError:
    langrule = {}

# ignored rules
ignore = {}

# cache for morphological analyses
analyses = {}
stems = {}
suggestions = {}

# assign Calc functions
calcfunc = None

# check settings
def option(lang, opt):
    return lightproof_handler_en.get_option(lang.Language + "_" + lang.Country, opt)
# filtering affix fields (ds, is, ts etc.)
def onlymorph(st):
    if st != None:
        st = re.sub(r"^.*(st:|po:)", r"\1", st)  # keep last word part
        st = re.sub(r"\b(?=[dit][sp]:)", "@", st)  # and its affixes
        st = re.sub(r"(?<!@)\b\w\w:\w+", "", st).replace('@', '').strip()
    return st
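# Illustrative example (added; assumes a Hunspell-style analysis string with
# st:/po: stem and part-of-speech fields -- not part of the original file):
#
#   onlymorph("st:walk po:verb ts:Vs")  # -> "ts:Vs"
#
# Everything up to the last st:/po: field is dropped, and only the affix
# fields (ds:, is:, ts:, dp:, ip:, tp:) of the last word part survive.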
# if the pattern matches all analyses of the input word,
# return the last matched substring
def _morph(rLoc, word, pattern, all, onlyaffix):
    global analyses
    if not word:
        return None
    if word not in analyses:
        x = spellchecker.spell(u"<?xml?><query type='analyze'><word>" + word + "</word></query>", rLoc, ())
        if not x:
            return None
        t = x.getAlternatives()
        if not t:
            t = [""]
        analyses[word] = t[0].split("</a>")[:-1]
    a = analyses[word]
    result = None
    p = re.compile(pattern)
    for i in a:
        if onlyaffix:
            i = onlymorph(i)
        result = p.search(i)
        if result:
            result = result.group(0)
            if not all:
                return result
        elif all:
            return None
    return result

def morph(rLoc, word, pattern, all=True):
    return _morph(rLoc, word, pattern, all, False)

def affix(rLoc, word, pattern, all=True):
    return _morph(rLoc, word, pattern, all, True)
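# Note (added): "spellchecker" and "SMGR" are not defined in this module;
# they are expected to be injected as module globals at runtime by
# lightproof_handler_en.
#
# Illustrative calls (hypothetical results; the analysis fields depend on
# the installed Hunspell dictionary):
#
#   morph(LOCALE, "walked", "po:verb")         # -> "po:verb" if every
#                                              #    analysis of "walked" matches
#   morph(LOCALE, "walked", "po:verb", False)  # -> first match in any analysis
#   affix(LOCALE, "walked", "ds:")             # same, but only the affix fields
#                                              # are searched (see onlymorph)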
def spell(rLoc, word):
    if not word:
        return None
    return spellchecker.isValid(word, rLoc, ())

# get the tuple of the stem of the word or an empty array
def stem(rLoc, word):
    global stems
    if not word:
        return []
    if not word in stems:
        x = spellchecker.spell(u"<?xml?><query type='stem'><word>" + word + "</word></query>", rLoc, ())
        if not x:
            return []
        t = x.getAlternatives()
        if not t:
            t = []
        stems[word] = list(t)
    return stems[word]

# get the tuple of the morphological generation of a word or an empty array
def generate(rLoc, word, example):
    if not word:
        return []
    x = spellchecker.spell(u"<?xml?><query type='generate'><word>" + word + "</word><word>" + example + "</word></query>", rLoc, ())
    if not x:
        return []
    t = x.getAlternatives()
    if not t:
        t = []
    return list(t)
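# Illustrative calls (hypothetical; the XML "stem"/"generate" queries are
# answered by the spell checker service, so results depend entirely on the
# dictionary's morphological data):
#
#   stem(LOCALE, "mice")               # -> e.g. ["mouse"], or [] if unsupported
#   generate(LOCALE, "mouse", "cats")  # -> inflections of "mouse" shaped like
#                                      #    "cats", e.g. ["mice"]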
# get suggestions
def suggest(rLoc, word):
    global suggestions
    if not word:
        return word
    if word not in suggestions:
        x = spellchecker.spell("_" + word, rLoc, ())
        if not x:
            return word
        t = x.getAlternatives()
        suggestions[word] = "\n".join(t)
    return suggestions[word]
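# Illustrative use (added; the result depends on the spelling dictionary).
# The "_" prefix makes the queried word invalid, forcing the spell checker
# to return alternatives, which come back newline-joined so rules can split
# them into separate grammar-checker suggestions:
#
#   suggest(LOCALE, "teh")  # -> e.g. "the\nten"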
# get the nth word of the input string or an empty string
def word(s, n):
    a = re.match("(?u)( [-.\\w%%]+){" + str(n-1) + "}( [-.\\w%%]+)", s)
    if not a:
        return ''
    return a.group(2)[1:]

# get the (-n)th word of the input string or an empty string
def wordmin(s, n):
    a = re.search("(?u)([-.\\w%%]+ )([-.\\w%%]+ ){" + str(n-1) + "}$", s)
    if not a:
        return ''
    return a.group(1)[:-1]
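# Illustrative calls (added): word() expects a space before each word and
# wordmin() a space after each word, matching how rules slice the checked
# sentence around a regex hit:
#
#   word(" one two three", 2)     # -> "two"
#   wordmin("one two three ", 1)  # -> "three"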
def calc(funcname, par):
    global calcfunc
    global SMGR
    if calcfunc == None:
        calcfunc = SMGR.createInstance("com.sun.star.sheet.FunctionAccess")
        if calcfunc == None:
            return None
    return calcfunc.callFunction(funcname, par)
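# Illustrative call (added; only works inside a running LibreOffice, since
# SMGR is the office service manager injected by lightproof_handler_en):
#
#   calc("ROUND", (1.8288, 2))  # -> 1.83, via Calc's ROUND function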
def proofread(nDocId, TEXT, LOCALE, nStartOfSentencePos, nSuggestedSentenceEndPos, rProperties):
    global ignore
    aErrs = []
    s = TEXT[nStartOfSentencePos:nSuggestedSentenceEndPos]
    for i in get_rule(LOCALE).dic:
        # 0: regex, 1: replacement, 2: message, 3: condition, 4: ngroup, (5: oldline), 6: case sensitive ?
        if i[0] and not str(i[0]) in ignore:
            for m in i[0].finditer(s):
                try:
                    if not i[3] or eval(i[3]):
                        aErr = uno.createUnoStruct("com.sun.star.linguistic2.SingleProofreadingError")
                        aErr.nErrorStart = nStartOfSentencePos + m.start(i[4])  # nStartOfSentencePos
                        aErr.nErrorLength = m.end(i[4]) - m.start(i[4])
                        aErr.nErrorType = PROOFREADING
                        aErr.aRuleIdentifier = str(i[0])
                        iscap = (i[-1] and m.group(i[4])[0:1].isupper())
                        if i[1][0:1] == "=":
                            aErr.aSuggestions = tuple(cap(eval(i[1][1:]).replace('|', "\n").split("\n"), iscap, LOCALE))
                        elif i[1] == "_":
                            aErr.aSuggestions = ()
                        else:
                            aErr.aSuggestions = tuple(cap(m.expand(i[1]).replace('|', "\n").split("\n"), iscap, LOCALE))
                        comment = i[2]
                        if comment[0:1] == "=":
                            comment = eval(comment[1:])
                        else:
                            comment = m.expand(comment)
                        aErr.aShortComment = comment.replace('|', '\n').replace('\\n', '\n').split("\n")[0].strip()
                        aErr.aFullComment = comment.replace('|', '\n').replace('\\n', '\n').split("\n")[-1].strip()
                        if "://" in aErr.aFullComment:
                            p = PropertyValue()
                            p.Name = "FullCommentURL"
                            p.Value = aErr.aFullComment
                            aErr.aFullComment = aErr.aShortComment
                            aErr.aProperties = (p,)
                        else:
                            aErr.aProperties = ()
                        aErrs = aErrs + [aErr]
                except Exception as e:
                    if len(i) == 7:
                        raise Exception(str(e), i[5])
                    raise
    return tuple(aErrs)
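# Illustrative rule entry (hypothetical; the real entries live in the
# generated lightproof_en.py module loaded by get_rule() below). After
# compile_rules() each entry looks roughly like:
#
#   [re.compile("(?u)\\bteh\\b"),  # 0: compiled pattern
#    "the",                        # 1: replacement ("=expr" evaluates, "_" = none)
#    "Possible typo.",             # 2: message ("=expr" evaluates)
#    False,                        # 3: condition string, or False for "always"
#    0,                            # 4: ngroup used for the error position
#    10,                           # 5: source line of the rule (for debugging)
#    True]                         # 6: flag added by compile_rules() for
#                                  #    rules written with "(?iu)"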
def cap(a, iscap, rLoc):
    if iscap:
        for i in range(0, len(a)):
            if a[i][0:1] == "i":
                if rLoc.Language == "tr" or rLoc.Language == "az":
                    a[i] = u"\u0130" + a[i][1:]
                elif a[i][1:2] == "j" and rLoc.Language == "nl":
                    a[i] = "IJ" + a[i][2:]
                else:
                    a[i] = "I" + a[i][1:]
            else:
                a[i] = a[i].capitalize()
    return a
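# Illustrative behavior (added): suggestions are capitalized to match a
# capitalized hit, with special cases for the Turkish/Azeri dotted I and the
# Dutch "ij" digraph (en_locale/nl_locale stand for the respective Locale
# structs):
#
#   cap(["its"], True, en_locale)  # -> ["Its"]
#   cap(["ijs"], True, nl_locale)  # -> ["IJs"]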
def compile_rules(dic):
    # compile regular expressions
    for i in dic:
        try:
            if re.compile("[(][?]iu[)]").match(i[0]):
                i += [True]
                i[0] = re.sub("[(][?]iu[)]", "(?u)", i[0])
            else:
                i += [False]
            i[0] = re.compile(i[0])
        except:
            if 'PYUNO_LOGLEVEL' in os.environ:
                print("Lightproof: bad regular expression: ", traceback.format_exc())
            i[0] = None
def get_rule(loc):
    try:
        return langrule[pkg]
    except:
        langrule[pkg] = __import__("lightproof_" + pkg)
        compile_rules(langrule[pkg].dic)
    return langrule[pkg]

def get_path():
    return os.path.join(os.path.dirname(sys.modules[__name__].__file__), __name__ + ".py")

# [code]

# pattern matching for common English abbreviations
abbrev = re.compile("(?i)\\b([a-z]|acct|approx|appt|apr|apt|assoc|asst|aug|ave|avg|co(nt|rp)?|ct|dec|defn|dept|dr|eg|equip|esp|est|etc|excl|ext|feb|fri|ft|govt?|hrs?|ib(id)?|ie|in(c|t)?|jan|jr|jul|lit|ln|mar|max|mi(n|sc)?|mon|Mrs?|mun|natl?|neg?|no(rm|s|v)?|nw|obj|oct|org|orig|pl|pos|prev|proj|psi|qty|rd|rec|rel|reqd?|resp|rev|sat|sci|se(p|pt)?|spec(if)?|sq|sr|st|subj|sun|sw|temp|thurs|tot|tues|univ|var|vs)\\.")
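# Illustrative matches (added): the trailing "\\." is part of the pattern,
# so the regex only fires on the abbreviated, period-terminated form:
#
#   abbrev.search("approx. 5 mi")   # matches "approx."
#   abbrev.search("approximately")  # no match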
# pattern for paragraph checking
paralcap = re.compile(u"(?u)^[a-z].*[.?!] [A-Z].*[.?!][)\u201d]?$")
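# Illustrative match (added): a paragraph that starts lowercase yet clearly
# contains complete sentences, i.e. a likely capitalization error:
#
#   paralcap.match("this is one sentence. And a second one.")  # matches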
punct = {"?": "question mark", "!": "exclamation mark",
         ",": "comma", ":": "colon", ";": "semicolon",
         "(": "opening parenthesis", ")": "closing parenthesis",
         "[": "opening square bracket", "]": "closing square bracket",
         u"\u201c": "opening quotation mark", u"\u201d": "closing quotation mark"}
aA = set(["eucalypti", "eucalyptus", "Eucharist", "Eucharistic",
    "euchre", "euchred", "euchring", "Euclid", "euclidean", "Eudora",
    "eugene", "Eugenia", "eugenic", "eugenically", "eugenicist",
    "eugenicists", "eugenics", "Eugenio", "eukaryote", "Eula", "eulogies",
    "eulogist", "eulogists", "eulogistic", "eulogized", "eulogizer",
    "eulogizers", "eulogizing", "eulogy", "eulogies", "Eunice", "eunuch",
    "eunuchs", "Euphemia", "euphemism", "euphemisms", "euphemist",
    "euphemists", "euphemistic", "euphemistically", "euphonious",
    "euphoniously", "euphonium", "euphony", "euphoria", "euphoric",
    "Euphrates", "euphuism", "Eurasia", "Eurasian", "Eurasians", "eureka",
    "eurekas", "eurhythmic", "eurhythmy", "Euridyce", "Euripides", "euripus",
    "Euro", "Eurocentric", "Euroclydon", "Eurocommunism", "Eurocrat",
    "eurodollar", "Eurodollar", "Eurodollars", "Euromarket", "Europa",
    "Europe", "European", "Europeanisation", "Europeanise", "Europeanised",
    "Europeanization", "Europeanize", "Europeanized", "Europeans", "europium",
    "Eurovision", "Eustace", "Eustachian", "Eustacia", "euthanasia",
    "Ewart", "ewe", "Ewell", "ewer", "ewers", "Ewing", "once", "one",
    "oneness", "ones", "oneself", "onetime", "oneway", "oneyear", "u",
    "U", "UART", "ubiquitous", "ubiquity", "Udale", "Udall", "UEFA",
    "Uganda", "Ugandan", "ugric", "UK", "ukase", "Ukraine", "Ukrainian",
    "Ukrainians", "ukulele", "Ula", "ululated", "ululation", "Ulysses",
    "UN", "unanimity", "unanimous", "unanimously", "unary", "Unesco",
    "UNESCO", "UNHCR", "uni", "unicameral", "unicameralism", "Unicef",
    "UNICEF", "unicellular", "Unicode", "unicorn", "unicorns", "unicycle",
    "unicyclist", "unicyclists", "unidimensional", "unidirectional",
    "unidirectionality", "unifiable", "unification", "unified", "unifier",
    "unifilar", "uniform", "uniformally", "uniformed", "uniformer",
    "uniforming", "uniformisation", "uniformise", "uniformitarian",
    "uniformitarianism", "uniformity", "uniformly", "uniformness", "uniforms",
    "unify", "unifying", "unijugate", "unilateral", "unilateralisation",
    "unilateralise", "unilateralism", "unilateralist", "unilaterally",
    "unilinear", "unilingual", "uniliteral", "uniliteralism", "uniliteralist",
    "unimodal", "union", "unionism", "unionist", "unionists", "unionisation",
    "unionise", "unionised", "unionising", "unionization", "unionize",
    "unionized", "unionizing", "unions", "unipolar", "uniprocessor",
    "unique", "uniquely", "uniqueness", "uniquer", "Uniroyal", "unisex",
    "unison", "Unisys", "unit", "Unitarian", "Unitarianism", "Unitarians",
    "unitary", "unite", "united", "unitedly", "uniter", "unites", "uniting",
    "unitize", "unitizing", "unitless", "units", "unity", "univ", "Univac",
    "univalent", "univalve", "univariate", "universal", "universalisation",
    "universalise", "universalised", "universaliser", "universalisers",
    "universalising", "universalism", "universalist", "universalistic",
    "universality", "universalisation", "universalization", "universalize",
    "universalized", "universalizer", "universalizers", "universalizing",
    "universally", "universalness", "universe", "universes", "universities",
    "university", "univocal", "Unix", "uracil", "Urals", "uranium", "Uranus",
    "uranyl", "urate", "urea", "uremia", "uremic", "ureter", "urethane",
    "urethra", "urethral", "urethritis", "Urey", "Uri", "uric", "urinal",
    "urinalysis", "urinary", "urinated", "urinating", "urination", "urine",
    "urogenital", "urokinase", "urologist", "urologists", "urology",
    "Uruguay", "Uruguayan", "Uruguayans", "US", "USA", "usability",
    "usable", "usably", "usage",
    "usages", "use", "used", "useful", "usefulness", "usefully", "useless",
    "uselessly", "uselessness", "Usenet", "user", "users", "uses", "using",
    "usual", "usually", "usurer", "usurers", "usuress", "usurial", "usurious",
    "usurp", "usurpation", "usurped", "usurper", "usurping", "usurps",
    "usury", "Utah", "utensil", "utensils", "uterine", "uterus", "Utica",
    "utilitarian", "utilitarianism", "utilities", "utility", "utilizable",
    "utilization", "utilize", "utilized", "utilizes", "utilizing", "utopia",
    "utopian", "utopians", "utopias", "Utrecht", "Uttoxeter", "uvula",
    "uvular"])

aAN = set(["f", "F", "FBI", "FDA", "heir", "heirdom", "heired",
    "heirer", "heiress", "heiring", "heirloom", "heirship", "honest",
    "honester", "honestly", "honesty", "honor", "honorable", "honorableness",
    "honorably", "honorarium", "honorary", "honored", "honorer", "honorific",
    "honoring", "honors", "honour", "honourable", "honourableness",
    "honourably", "honourarium", "honourary", "honoured", "honourer",
    "honourific", "honouring", "Honours", "hors", "hour", "hourglass", "hourlong",
    "hourly", "hours", "l", "L", "LCD", "m", "M", "MBA", "MP", "mpg", "mph",
    "MRI", "MSc", "MTV", "n", "N", "NBA", "NBC", "NFL", "NGO", "NHL", "r",
    "R", "s", "S", "SMS", "sos", "SOS", "SPF", "std", "STD", "SUV", "x",
    "X", "XML"])

aB = set(["H", "habitual", "hallucination", "haute", "hauteur", "herb", "herbaceous", "herbal",
    "herbalist", "herbalism", "heroic", "hilarious", "historian", "historic", "historical",
    "homage", "homophone", "horrendous", "hospitable", "horrific", "hotel", "hypothesis", "Xmas"])
def measurement(mnum, min, mout, mstr, decimal, remove):
    if min == "ft" or min == "in" or min == "mi":
        mnum = mnum.replace(" 1/2", ".5").replace(u" \xbd", ".5").replace(u"\xbd", ".5")
    m = calc("CONVERT_ADD", (float(eval(mnum.replace(remove, "").replace(decimal, ".").replace(u"\u2212", "-"))), min, mout))
    a = list(set([str(calc("ROUND", (m, 0)))[:-2], str(calc("ROUND", (m, 1))), str(calc("ROUND", (m, 2))), str(m)]))  # remove duplicated rounded items
    a.sort(key=lambda x: len(x))  # sort by string length
    return (mstr + "\n").join(a).replace(".", decimal).replace("-", u"\u2212") + mstr
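# Illustrative call (added; needs a live Calc FunctionAccess service, see
# calc() above, so this only runs inside LibreOffice):
#
#   measurement("6", "ft", "m", " m", ".", ",")
#   # -> "2 m\n1.8 m\n1.83 m\n1.8288 m" (rounded variants, shortest first)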