Why Does re.sub() Not Work in Python 3.6?

  ascii, non-ascii-characters, python, re, regex

I’m working on a project where I have to read data from an Excel spreadsheet. I decided to use Python because I’m used to it and using regular expressions as well; however, I noticed something recently: Somehow when I use "re.sub()" the characters in the original string are not replaced. This is not the first time I use "re.sub()," I have used it before in older versions of Python (I think version 2.7, I’m currently using version 3.6).

When I use "string.replace()" the characters from the original string get replaced, but not when I’m using "re.sub()."
I’m wondering if I’m doing something wrong.
Could anyone please check this on your end?

Note: I have to use Python 3.6, because there are some libraries I use in the project that are not supported in more recent versions.

Technical Details:

This is what I have so far, you can download the script and a sample test here:

def sanitizeLines(string):
    #all illegal space characters
    string = re.sub(u'([u2000-u206f])', " ", string)
    string = re.sub(u'(u00a0)', " ", string)
    string = re.sub(u'(u1680)', " ", string)
    string = re.sub(u'(u180e)', " ", string)
    string = re.sub(u'(ufeff)', " ", string)
    string = re.sub(u'(u00ad)', " ", string)

    #all illegal control characters
    string = re.sub(u'([u0004-u0007])', " ", string)
    string = re.sub(u'(u0081)', " ", string)
    string = re.sub(u'(u008d)', " ", string)
    
    #emojis
    string = re.sub(u'(u2714)', " ", string)
    string = re.sub(u'([u1f980-u1f984])', " ", string)
    string = re.sub(u'([u1f910-u1f918])', " ", string)
    string = re.sub(u'([u1f973-u1f976])', " ", string)
    string = re.sub(u'([u26a0-u26a1])', " ", string)

    """
    #all illegal ASCII characters
    string = re.sub(u'(ueed1)', " ", string)
    string = re.sub(u'(u0082)', " ", string)
    string = re.sub(u'(u0089)', " ", string)
    string = re.sub(u'([u0091-u0094])', " ", string)
    string = re.sub(u'([u0096-u0097])', " ", string)
    string = re.sub(u'(u009a)', " ", string)
    string = re.sub(u'(u0161)', " ", string)
    string = re.sub(u'([u201c-u201d])', " ", string)
    string = re.sub(u'([u2018-u2019])', " ", string) 
    string = re.sub(u'(u00a4)', " ", string)
    string = re.sub(u'([u00a6-u00a8])', " ", string)
    string = re.sub(u'([u00aa-u00ac])', " ", string)
    string = re.sub(u'([u00af-u00be])', " ", string)
    string = re.sub(u'([u00c4-u00c6])', " ", string)
    string = re.sub(u'(u00cb)', " ", string)
    string = re.sub(u'([u00cf-u00d0])', " ", string)
    string = re.sub(u'([u00d6-u00d8])', " ", string)
    string = re.sub(u'([u00dd-u00df])', " ", string)
    string = re.sub(u'([u00e4-u00e6])', " ", string)
    string = re.sub(u'(u00eb)', " ", string)
    string = re.sub(u'([u00ef-u00f0])', " ", string)
    string = re.sub(u'([u00f6-u00f8])', " ", string)
    string = re.sub(u'([u00fd-u00ff])', " ", string)
    string = re.sub(u'(u0131)', " ", string)
    string = re.sub(u'(u0192)', " ", string)
    string = re.sub(u'(u2017)', " ", string)
    string = re.sub(u'(u2261)', " ", string)
    string = re.sub(u'(u2500)', " ", string)
    string = re.sub(u'(u2502)', " ", string)
    string = re.sub(u'(u250c)', " ", string)
    string = re.sub(u'(u2510)', " ", string)
    string = re.sub(u'(u2514)', " ", string)
    string = re.sub(u'(u2518)', " ", string)
    string = re.sub(u'(u251c)', " ", string)
    string = re.sub(u'(u252c)', " ", string)
    string = re.sub(u'(u2534)', " ", string)
    string = re.sub(u'(u253c)', " ", string)
    string = re.sub(u'([u2550-u2551])', " ", string)
    string = re.sub(u'(u2554)', " ", string)
    string = re.sub(u'(u2557)', " ", string)
    string = re.sub(u'(u255a)', " ", string)
    string = re.sub(u'(u255d)', " ", string)
    string = re.sub(u'(u2560)', " ", string)
    string = re.sub(u'(u2563)', " ", string)
    string = re.sub(u'(u2566)', " ", string)
    string = re.sub(u'(u2569)', " ", string)
    string = re.sub(u'(u256c)', " ", string)
    string = re.sub(u'(u2580)', " ", string)
    string = re.sub(u'(u2584)', " ", string)
    string = re.sub(u'(u2588)', " ", string)
    string = re.sub(u'([u2591-u2593])', " ", string)
    string = re.sub(u'(u25a0)', " ", string)
    string = re.sub(u'(u3002)', " ", string)
    """

def normalizeLines(string):
    #"""
    #all illegal ASCII characters
    string = string.replace("", " ") #ueed1
    string = string.replace("‰", " ") #u0089
    string = string.replace("‚", " ") #u0082
    string = string.replace("“", """)
    string = string.replace("”", """) #u0093-u0094 or u201c-u201d
    string = string.replace("‘", "'")
    string = string.replace("’", "'") #u0091-u0092 or u2018-u2019
    string = string.replace("–", "-") #u0096
    string = string.replace("—", "-") #u0097
    string = string.replace("š", " ") #u009a or u0161
    string = string.replace("¤", " ") #u00a4
    string = string.replace("¦", " ") #u00a6
    string = string.replace("§", " ") #u00a7
    string = string.replace("¨", " ") #u00a8
    string = string.replace("ª", " ") #u00aa
    string = string.replace("«", " ") #u00ab
    string = string.replace("¬", " ") #u00ac
    string = string.replace("¯", " ") #u00af
    string = string.replace("°", " ") #u00b0
    string = string.replace("±", " ") #u00b1
    string = string.replace("²", " ") #u00b2
    string = string.replace("³", " ") #u00b3
    string = string.replace("´", " ") #u00b4
    string = string.replace("µ", " ") #u00b5
    string = string.replace("¶", " ") #u00b6
    string = string.replace("·", " ") #u00b7
    string = string.replace("¸", " ") #u00b8
    string = string.replace("¹", " ") #u00b9
    string = string.replace("º", " ") #u00ba
    string = string.replace("»", " ") #u00bb
    string = string.replace("¼", " ") #u00bc
    string = string.replace("½", " ") #u00bd
    string = string.replace("¾", " ") #u00be
    string = string.replace("Ä", " ") #u00c4
    string = string.replace("Å", " ") #u00c5
    string = string.replace("Æ", " ") #u00c6
    string = string.replace("Ë", " ") #u00cb
    string = string.replace("Ï", " ") #u00cf
    string = string.replace("Ð", " ") #u00d0
    string = string.replace("Ö", " ") #u00d6
    string = string.replace("×", " ") #u00d7
    string = string.replace("Ø", " ") #u00d8
    string = string.replace("Ý", " ") #u00dd
    string = string.replace("Þ", " ") #u00de
    string = string.replace("ß", " ") #u00df
    string = string.replace("ä", " ") #u00e4
    string = string.replace("å", " ") #u00e5
    string = string.replace("æ", " ") #u00e6
    string = string.replace("ë", " ") #u00eb
    string = string.replace("ï", " ") #u00ef
    string = string.replace("ð", " ") #u00f0
    string = string.replace("ö", " ") #u00f6
    string = string.replace("÷", " ") #u00f7
    string = string.replace("ø", " ") #u00f8
    string = string.replace("ý", " ") #u00fd
    string = string.replace("þ", " ") #u00fe
    string = string.replace("ÿ", " ") #u00ff
    string = string.replace("ı", " ") #u0131
    string = string.replace("ƒ", " ") #u0083 or u0192
    string = string.replace("‗", " ") #u2017
    string = string.replace("≡", " ") #u2261
    string = string.replace("─", " ") #u2500
    string = string.replace("│", " ") #u2502
    string = string.replace("┌", " ") #u250c
    string = string.replace("┐", " ") #u2510
    string = string.replace("└", " ") #u2514
    string = string.replace("┘", " ") #u2518
    string = string.replace("├", " ") #u251c
    string = string.replace("┬", " ") #u252c
    string = string.replace("┴", " ") #u2534
    string = string.replace("┼", " ") #u253c
    string = string.replace("═", " ") #u2550
    string = string.replace("║", " ") #u2551
    string = string.replace("╔", " ") #u2554
    string = string.replace("╗", " ") #u2557
    string = string.replace("╚", " ") #u255a
    string = string.replace("╝", " ") #u255d
    string = string.replace("╠", " ") #u2560
    string = string.replace("╣", " ") #u2563
    string = string.replace("╦", " ") #u2566
    string = string.replace("╩", " ") #u2569
    string = string.replace("╬", " ") #u256c
    string = string.replace("▀", " ") #u2580
    string = string.replace("▄", " ") #u2584
    string = string.replace("█", " ") #u2588
    string = string.replace("░", " ") #u2591
    string = string.replace("▒", " ") #u2592
    string = string.replace("▓", " ") #u2593
    string = string.replace("■", " ") #u25a0
    string = string.replace("。", " ") #u3002
    #"""

    string = string.replace("|", " ")
    string = string.replace("t", " ")

    #remove all the new lines from the string
    newString = ' '.join(string.splitlines())

    #replace white spaces with a single space
    myNewString = ' '.join(newString.split())

    return myNewString

Steps to reproduce the issue:

  • Run the script as is, you will notice the only lines that did not match the regex are the ones with emojis.
  • Delete the generated folders.
  • Open up the script and uncomment the "re.sub()" lines and comment out
    the "string.replace()" lines.
  • Save and run the script again.
  • This time you will notice there are more lines that did not match the regex.

Source: Python Questions

LEAVE A COMMENT